From 372569ecd8df70bcbd9e7bd9e1657149572f11f1 Mon Sep 17 00:00:00 2001 From: Winda0001 <13912795021@163.com> Date: Wed, 22 Apr 2026 19:45:01 +0800 Subject: [PATCH 1/4] chore: add .worktrees/ to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fc2aac7..f7fdf35 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ logs/ .claude/ gemini文本生图.py nul +.worktrees/ From ae78a609e538b3d710a8f3ed38e30a7e032e28bc Mon Sep 17 00:00:00 2001 From: Winda0001 <13912795021@163.com> Date: Fri, 8 May 2026 18:43:38 +0800 Subject: [PATCH 2/4] chore: all tests passing after 4/20-4/22 redesign - Task 7: appraise_agent.py GRADE scoring with SR/MA/NMA included_study_type, upgrade blocking on serious bias, confounding_bias_mitigates - Task 8: appraise_agent.txt prompt updated with included_study_type, confounding_bias_mitigates, upgrade_blocked_by_bias fields - Task 9: apply_agent.py route_type injection, structured GRADE summary, inconsistency enforcement - Task 10: apply_agent.txt updated with {route_type} input - Task 11-14: judge prompts rewritten with Gate+Rubrics architecture - Task 15: assess_judge.txt updated with route_type/route_confidence/ ebm_query inputs, route_confidence_noted output - Task 16: judge_llm.py Gate+Rubrics scoring, RUBRIC_WEIGHTS, _check_gates, _score_rubrics - Task 17: tests/test_judge_rubrics.py (6 tests), tests/test_integration_routing.py (4 tests) - Task 18: 15/15 tests passing --- .../plans/2026-04-20-22-full-redesign.md | 321 +++++ ...2026-04-22-judge-rubrics-implementation.md | 1276 +++++++++++++++++ .../2026-04-20-acquire-agent-redesign.md | 249 ++++ .../2026-04-20-acquire-judge-redesign.md | 183 +++ .../specs/2026-04-20-apply-agent-alignment.md | 235 +++ .../specs/2026-04-20-apply-judge-redesign.md | 141 ++ .../2026-04-20-appraise-agent-grade-fix.md | 187 +++ .../2026-04-20-appraise-judge-redesign.md | 130 ++ .../specs/2026-04-20-ask-agent-redesign.md | 207 +++ 
.../specs/2026-04-20-ask-judge-redesign.md | 428 ++++++ .../specs/2026-04-20-assess-judge-redesign.md | 139 ++ .../2026-04-22-judge-rubrics-redesign.md | 419 ++++++ src/agents/acquire_agent.py | 232 ++- src/agents/apply_agent.py | 108 +- src/agents/appraise_agent.py | 63 +- src/agents/ask_agent.py | 420 +++++- src/config/prompts/apply_agent.txt | 8 + src/config/prompts/appraise_agent.txt | 21 + src/config/prompts/ask/diag_step1.txt | 56 + src/config/prompts/ask/diag_step2.txt | 75 + src/config/prompts/ask/direct_answer.txt | 54 + src/config/prompts/ask/ebm_peo.txt | 74 + src/config/prompts/ask/ebm_pico.txt | 66 + src/config/prompts/ask/ebm_pird.txt | 75 + src/config/prompts/ask/ebm_prognosis.txt | 76 + src/config/prompts/ask/router.txt | 84 ++ src/config/prompts/judge/acquire_judge.txt | 206 +-- src/config/prompts/judge/apply_judge.txt | 163 ++- src/config/prompts/judge/appraise_judge.txt | 146 +- src/config/prompts/judge/ask_judge.txt | 141 +- src/config/prompts/judge/assess_judge.txt | 17 +- src/coordinator/coordinator.py | 17 + src/judge/judge_llm.py | 217 ++- src/state/schema.py | 22 + src/tools/pubmed_api.py | 102 +- tests/test_appraise_grade.py | 92 ++ tests/test_integration_routing.py | 206 +++ tests/test_judge_rubrics.py | 141 ++ 38 files changed, 6356 insertions(+), 441 deletions(-) create mode 100644 docs/superpowers/plans/2026-04-20-22-full-redesign.md create mode 100644 docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md create mode 100644 docs/superpowers/specs/2026-04-20-acquire-agent-redesign.md create mode 100644 docs/superpowers/specs/2026-04-20-acquire-judge-redesign.md create mode 100644 docs/superpowers/specs/2026-04-20-apply-agent-alignment.md create mode 100644 docs/superpowers/specs/2026-04-20-apply-judge-redesign.md create mode 100644 docs/superpowers/specs/2026-04-20-appraise-agent-grade-fix.md create mode 100644 docs/superpowers/specs/2026-04-20-appraise-judge-redesign.md create mode 100644 
docs/superpowers/specs/2026-04-20-ask-agent-redesign.md create mode 100644 docs/superpowers/specs/2026-04-20-ask-judge-redesign.md create mode 100644 docs/superpowers/specs/2026-04-20-assess-judge-redesign.md create mode 100644 docs/superpowers/specs/2026-04-22-judge-rubrics-redesign.md create mode 100644 src/config/prompts/ask/diag_step1.txt create mode 100644 src/config/prompts/ask/diag_step2.txt create mode 100644 src/config/prompts/ask/direct_answer.txt create mode 100644 src/config/prompts/ask/ebm_peo.txt create mode 100644 src/config/prompts/ask/ebm_pico.txt create mode 100644 src/config/prompts/ask/ebm_pird.txt create mode 100644 src/config/prompts/ask/ebm_prognosis.txt create mode 100644 src/config/prompts/ask/router.txt create mode 100644 tests/test_appraise_grade.py create mode 100644 tests/test_integration_routing.py create mode 100644 tests/test_judge_rubrics.py diff --git a/docs/superpowers/plans/2026-04-20-22-full-redesign.md b/docs/superpowers/plans/2026-04-20-22-full-redesign.md new file mode 100644 index 0000000..64a79d8 --- /dev/null +++ b/docs/superpowers/plans/2026-04-20-22-full-redesign.md @@ -0,0 +1,321 @@ +# 4/20-4/22 全量重设计实现计划 + +> **For agentic workers:** Use superpowers:subagent-driven-development to implement task-by-task. 
+ +**Goal:** Ask 路由重设计、Acquire PMC+RAG、Appraise GRADE 修正、Apply 对齐、Judge Gate+Rubrics 架构重写。 + +**原则:** 每个 Task 先写失败测试(如适用),再改代码,再验证,不需要 commit(各 Task 末尾的 `git commit` 步骤可跳过)。 + +--- + +## 文件改动清单 + +| 文件 | 操作 | +|---|---| +| src/state/schema.py | 新增 EBMQuery;WorkflowState 路由字段;Evidence.has_full_text | +| src/config/prompts/ask/ | 新建目录 + 8 个 prompt 文件 | +| src/agents/ask_agent.py | 重写:路由→分支→统一输出 | +| src/coordinator/coordinator.py | direct_answer 提前终止 | +| src/tools/pubmed_api.py | 新增 fetch_pmc_full_text | +| src/agents/acquire_agent.py | EBMQuery 适配;PMC 全文;BM25+Embedding RAG | +| src/agents/appraise_agent.py | _compute_grade 重写;SR 动态初始分;升级阻断 | +| src/config/prompts/appraise_agent.txt | 新增 included_study_type、confounding_bias_mitigates | +| src/agents/apply_agent.py | route_type 注入;结构化 GRADE;inconsistency 强制规则 | +| src/config/prompts/apply_agent.txt | 路由维度检查;结构化 GRADE 输入变量 | +| src/config/prompts/judge/ask_judge.txt | 重写:Gate+Rubrics | +| src/config/prompts/judge/acquire_judge.txt | 重写:Gate+Rubrics | +| src/config/prompts/judge/appraise_judge.txt | 重写:Gate+Rubrics | +| src/config/prompts/judge/apply_judge.txt | 重写:Gate+Rubrics | +| src/config/prompts/judge/assess_judge.txt | 新增路由字段输入;route_confidence_noted | +| src/judge/judge_llm.py | Gate 检查;Rubric 评分;RUBRIC_WEIGHTS | +| tests/test_judge_rubrics.py | 新建:Gate+Rubrics 单元测试 | +| tests/test_integration_routing.py | 新建:路由集成测试 | + +--- + +## Task 1: schema.py — EBMQuery + routing fields + Evidence.has_full_text + +**Files:** `src/state/schema.py` + +- [ ] 在 PICOQuery 之后插入 EBMQuery dataclass(字段:query_type, patient, primary_focus, outcome, keywords, comparator=None, reference_standard=None, time_horizon=None) +- [ ] Evidence 新增 `has_full_text: bool = False` +- [ ] WorkflowState 新增:route_type, route_confidence, direct_answer_output, ebm_query, sub_pico_queries, sub_question_index, sub_question_total +- [ ] 验证:`python3 -c "from src.state.schema import EBMQuery, WorkflowState, Evidence; print('OK')"` +- [ ] `git commit -m "feat(schema): EBMQuery, routing
fields, Evidence.has_full_text"` + +--- + +## Task 2: Ask prompt files — create src/config/prompts/ask/ with 8 files + +**Files:** `src/config/prompts/ask/`(新建目录) + +| 文件 | 输入变量 | 输出 JSON 关键字段 | +|---|---|---| +| router.txt | {question} | route_type, reasoning | +| direct_answer.txt | {question} | answer, source, disclaimer | +| diag_step1.txt | {question} | clinical_features[], differential_diagnoses[](最多3个) | +| diag_step2.txt | {clinical_features}, {single_diagnosis} | EBMQuery 字段 | +| ebm_pico.txt | {question}, {backtrack_context} | EBMQuery(query_type=pico) | +| ebm_pird.txt | {question}, {backtrack_context} | EBMQuery(query_type=pird) | +| ebm_peo.txt | {question}, {backtrack_context} | EBMQuery(query_type=peo) | +| ebm_prognosis.txt | {question}, {backtrack_context} | EBMQuery(query_type=prognosis) | + +router.txt 的 direct_answer 触发条件须同时满足:(1) 要求立即操作性指导;(2) 延迟会危及生命;(3) 答案来自公认标准流程(BLS/ACLS)。 + +- [ ] 创建目录并写入 8 个文件,每个文件含 Role + 输入变量 + 输出 JSON 格式,不含示例数据 +- [ ] 验证所有文件存在且非空 +- [ ] `git commit -m "feat(ask-prompts): 8 routing prompt files"` + +--- + +## Task 3: ask_agent.py — rewrite with routing logic + +**Files:** `src/agents/ask_agent.py` + +``` +__init__: 从 src/config/prompts/ask/ 加载 8 个 prompt 到 self._prompts dict +execute(state): + route = _call("router", question) + if route == "direct_answer": return {direct_answer_output, _ask_direct_answer: True} + if route == "diagnostic_reasoning": + step1 = _call("diag_step1"); sub_queries = [_call("diag_step2", ...) 
for diag in differentials] + return {sub_pico_queries, ebm_query: sub_queries[0]} + ebm_dict = _call(route_type, question, backtrack_context) + return {ebm_query: EBMQuery(**ebm_dict), pico_query: PICOQuery(...), route_type} +``` + +- [ ] 重写 ask_agent.py +- [ ] 验证:`python3 -c "from src.agents.ask_agent import AskAgent; print('OK')"` +- [ ] `git commit -m "feat(ask-agent): routing with direct_answer/diag/ebm_* branches"` + +--- + +## Task 4: coordinator.py — direct_answer early termination + +**Files:** `src/coordinator/coordinator.py` + +```python +if agent_name == "Ask" and result.get("_ask_direct_answer"): + state.update(result); state["should_terminate"] = True; return state +``` + +- [ ] 在 execute_agent 的 result = agent.execute(state) 之后插入上述代码 +- [ ] 验证:`python3 -c "from src.coordinator.coordinator import Coordinator; print('OK')"` +- [ ] `git commit -m "feat(coordinator): early termination for direct_answer route"` + +--- + +## Task 5: pubmed_api.py — add fetch_pmc_full_text via PMC OA BioC JSON API + +**Files:** `src/tools/pubmed_api.py` + +``` +URL: https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmcid}/unicode +timeout=10,任何异常返回 None +解析 documents[0].passages[*].text,拼接返回 +``` + +- [ ] 在文件末尾追加 fetch_pmc_full_text 函数 +- [ ] 验证:`python3 -c "from src.tools.pubmed_api import fetch_pmc_full_text; print('OK')"` +- [ ] `git commit -m "feat(pubmed-api): add fetch_pmc_full_text via PMC OA BioC JSON"` + +--- + +## Task 6: acquire_agent.py — EBMQuery support + PMC full text + BM25/Embedding RAG + +**Files:** `src/agents/acquire_agent.py` + +- [ ] 新增 _FILTER_BY_ROUTE_TYPE(ebm_pico→HSSS, ebm_pird→DTA, ebm_peo/prognosis→OBSERVATIONAL) +- [ ] 新增线程安全 _get_embedding_model()(懒加载 all-MiniLM-L6-v2) +- [ ] 新增 _fetch_full_texts(candidates):ThreadPoolExecutor(max_workers=8),写入 evidence.full_text 和 evidence.has_full_text +- [ ] 新增 _rag_extract(evidence, query_terms):BM25 top-8 → Embedding rerank top-3,返回 (key_sentences, score) +- [ ] 更新 execute:读 
ebm_query(优先)或 pico_query(兼容);fetch full texts;RAG extract;full_text 文章排前 +- [ ] 验证:`python3 -c "from src.agents.acquire_agent import AcquireAgent; print('OK')"` +- [ ] `git commit -m "feat(acquire): EBMQuery routing, PMC full-text, BM25+Embedding RAG"` + +--- + +## Task 7: appraise_agent.py — fix _compute_grade: SR dynamic initial score, upgrade blocked by SERIOUS bias, cap observational at Moderate + +**Files:** `src/agents/appraise_agent.py`, `tests/test_appraise_grade.py` + +测试用例(5个):SR+RCT→High;SR+OBSERVATIONAL→Low;COHORT+SERIOUS+全升级→Very Low(阻断);COHORT+NOT_SERIOUS+全升级→Moderate(cap);CROSS_SECTIONAL+全升级→Low(不在升级类型中) + +``` +_SR_INITIAL_POINTS = {"RCT": 4, "OBSERVATIONAL": 2, "MIXED": 3, "UNKNOWN": 3} +_SR_TYPES = {"SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA"} +_UPGRADE_STUDY_TYPES = {"COHORT", "CASE_CONTROL"} +``` + +- [ ] 先写测试,确认 FAIL +- [ ] 重写 _compute_grade:SR 查 _SR_INITIAL_POINTS;升级仅对 _UPGRADE_STUDY_TYPES 且 NOT_SERIOUS 时生效;升级后 cap at min(points, 3) +- [ ] 确认测试 PASS +- [ ] `git commit -m "feat(appraise): dynamic SR initial score, upgrade blocked by bias, cap at Moderate"` + +--- + +## Task 8: appraise_agent.txt — add included_study_type, confounding_bias_mitigates fields + +**Files:** `src/config/prompts/appraise_agent.txt` + +- [ ] 在"研究类型"节末尾新增 included_study_type 说明(仅 SR/MA/NMA 填写;取值 RCT/OBSERVATIONAL/MIXED/UNKNOWN) +- [ ] 在"升级因素"节新增 confounding_bias_mitigates(YES/NO/NA)和 upgrade_blocked_by_bias(true/false);注明 SERIOUS 偏倚时升级被阻断 +- [ ] 在 JSON 输出模板中新增这两个字段 +- [ ] 验证:`python3 -c "t=open('src/config/prompts/appraise_agent.txt').read(); assert 'included_study_type' in t; print('OK')"` +- [ ] `git commit -m "feat(prompt/appraise): included_study_type, confounding_bias_mitigates"` + +--- + +## Task 9: apply_agent.py — route_type injection, structured GRADE, inconsistency enforcement + +**Files:** `src/agents/apply_agent.py`, `tests/test_apply_agent.py` + +测试用例(4个):_format_ebm_query pico 含 "Intervention:";pird 含 "Index Test:";_summarize_downgrade_factors 全 
NOT_SERIOUS→固定字符串;有 SERIOUS→含因素名 + +- [ ] 新增模块级函数:_format_ebm_query, _format_pico_query, _summarize_downgrade_factors +- [ ] 更新 execute:注入 route_type/query_description/key_downgrade_factors/has_serious_inconsistency +- [ ] 强制规则:overall_grade in (Very Low, Low) + LLM 给 Strong → 降为 Weak;has_serious_inconsistency + Strong → 降为 Weak +- [ ] route_confidence == "low" 时在 caveats 追加警告 +- [ ] 先写测试确认 FAIL,修改代码确认 PASS +- [ ] `git commit -m "feat(apply): route_type injection, structured GRADE, inconsistency enforcement"` + +--- + +## Task 10: apply_agent.txt — add route_type dimension check, structured GRADE input variables + +**Files:** `src/config/prompts/apply_agent.txt` + +- [ ] 新增输入变量:{route_type}, {query_description}, {overall_grade}, {downgrade_factors}, {consistency_flag} +- [ ] 在 prompt 开头新增 Step 0:根据 {route_type} 说明当前问题框架(治疗/诊断/病因/预后) +- [ ] 在推荐强度规则中明确写入:SERIOUS inconsistency → 不得给 Strong +- [ ] 验证:`python3 -c "t=open('src/config/prompts/apply_agent.txt').read(); assert '{route_type}' in t; print('OK')"` +- [ ] `git commit -m "feat(prompt/apply): route_type dimension check, structured GRADE input"` + +--- + +## Task 11: ask_judge.txt — rewrite Gate+Rubrics + +**Files:** `src/config/prompts/judge/ask_judge.txt` + +Gate(任一触发 → 直接 FAIL):intent_distorted == YES;keywords_english_medical == NO + +| 维度 | 权重 | +|---|---| +| pico_completeness(P/I/O 均 YES) | 0.30 | +| keyword_quality(MeSH + 同义词) | 0.25 | +| route_correctness(route_type 与问题匹配) | 0.25 | +| clarity(表述清晰度) | 0.20 | + +输出 JSON 新增:gate_passed: bool, rubric_scores: {...}, weighted_score: float + +- [ ] 重写文件 +- [ ] `git commit -m "feat(judge/ask): Gate+Rubrics architecture"` + +--- + +## Task 12: acquire_judge.txt — rewrite Gate+Rubrics + +**Files:** `src/config/prompts/judge/acquire_judge.txt` + +Gate:search_terms_valid == NO + +| 维度 | 权重 | +|---|---| +| evidence_quality(best_study_type) | 0.35 | +| pico_match | 0.35 | +| selection_quality(listwise 合理性) | 0.30 | + +- [ ] 重写文件 +- [ ] `git commit -m 
"feat(judge/acquire): Gate+Rubrics architecture"` + +--- + +## Task 13: appraise_judge.txt — rewrite Gate+Rubrics + +**Files:** `src/config/prompts/judge/appraise_judge.txt` + +Gate:study_type_correct == NO + +| 维度 | 权重 | +|---|---| +| downgrade_factors(分类合理性) | 0.35 | +| computed_grade(合理性) | 0.35 | +| upgrade_factors(含 confounding_bias_mitigates 审计) | 0.30 | + +- [ ] 重写文件 +- [ ] `git commit -m "feat(judge/appraise): Gate+Rubrics architecture"` + +--- + +## Task 14: apply_judge.txt — rewrite Gate+Rubrics + +**Files:** `src/config/prompts/judge/apply_judge.txt` + +Gate:recommendation_based_on_evidence == NO + +| 维度 | 权重 | +|---|---| +| grounding(推荐-证据匹配) | 0.35 | +| strength_match(推荐强度 vs GRADE) | 0.35 | +| route_dimension(route_dimension_correct) | 0.15 | +| actionability(临床可操作性) | 0.15 | + +新增输入变量 {route_type},用于判断推荐是否符合当前问题框架。 + +- [ ] 重写文件 +- [ ] `git commit -m "feat(judge/apply): Gate+Rubrics, route_dimension audit"` + +--- + +## Task 15: assess_judge.txt — add route_type/route_confidence/ebm_query inputs, route_confidence_noted output field + +**Files:** `src/config/prompts/judge/assess_judge.txt` + +- [ ] 输入新增:{route_type}, {route_confidence}, {ebm_query_description} +- [ ] ask_to_acquire_link 审计新增:检索词是否覆盖 {route_type} 对应的关键维度 +- [ ] 新增审计项 route_confidence_noted(若 route_confidence=low,输出是否包含不确定性说明) +- [ ] 输出 JSON 新增 `route_confidence_noted: "YES | NO | NA"` +- [ ] `git commit -m "feat(judge/assess): route_type/ebm_query inputs, route_confidence_noted"` + +--- + +## Task 16: judge_llm.py — add _check_gates, _score_rubrics, RUBRIC_WEIGHTS; update _score_ask/acquire/appraise/apply + +**Files:** `src/judge/judge_llm.py`, `tests/test_judge_rubrics.py` + +```python +RUBRIC_WEIGHTS = { + "ask": {"pico_completeness": 0.30, "keyword_quality": 0.25, "route_correctness": 0.25, "clarity": 0.20}, + "acquire": {"evidence_quality": 0.35, "pico_match": 0.35, "selection_quality": 0.30}, + "appraise": {"downgrade_factors": 0.35, "computed_grade": 0.35, "upgrade_factors": 
0.30}, + "apply": {"grounding": 0.35, "strength_match": 0.35, "route_dimension": 0.15, "actionability": 0.15}, +} +``` + +测试用例(5个):Gate 触发时 _score_ask 返回 0.0;Gate 通过时按权重正确计算;_check_gates("ask", {"intent_distorted": "YES"}) 返回 False;_check_gates("apply", {"recommendation_based_on_evidence": "NO"}) 返回 False;全 YES rubric_scores 返回 1.0 + +- [ ] 先写测试,确认 FAIL +- [ ] 新增 _check_gates, _score_rubrics;更新 _score_ask/acquire/appraise/apply;更新 _prepare_context 传入路由字段 +- [ ] 确认测试 PASS +- [ ] `git commit -m "feat(judge-llm): Gate+Rubrics scoring, RUBRIC_WEIGHTS"` + +--- + +## Task 17: Integration tests — tests/test_integration_routing.py (mock LLM, test direct_answer/ebm_pico/ebm_pird routing) + +**Files:** `tests/test_integration_routing.py`(新建) + +- [ ] direct_answer 路由 → should_terminate=True,direct_answer_output 非空 +- [ ] ebm_pico 路由 → ebm_query.query_type == "pico",pico_query 兼容字段存在 +- [ ] ebm_pird 路由 → ebm_query.query_type == "pird" +- [ ] 旧 pico_query 兼容 → Acquire 正常运行 +- [ ] `git commit -m "test(integration): routing flow with mock LLM"` + +--- + +## Task 18: Full regression — run all tests, fix failures + +- [ ] `python3 -m pytest tests/ -v --tb=short 2>&1 | tail -30` +- [ ] 确认无 FAILED +- [ ] 如有失败,逐一修复后重新运行 +- [ ] `git commit -m "chore: all tests passing after 4/20-4/22 redesign"` diff --git a/docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md b/docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md new file mode 100644 index 0000000..914249b --- /dev/null +++ b/docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md @@ -0,0 +1,1276 @@ +# Judge Rubrics 重设计实现计划 + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** 将 Ask/Acquire/Appraise/Apply 四个阶段的 Judge 改造为 Gate + Weighted Rubrics 架构,使评分逻辑对 LLM 和人工标注者均透明可验证。 + +**Architecture:** LLM Judge 输出每条 rubric 的 YES/PARTIAL/NO;Python 侧先做 Gate 检查(任一 NO → 立即 fail),再按 Critical=3/Major=2/Minor=1 权重计算总分,≥0.7 → pass。决策模型读取 gate 失败项和低分 rubric 群生成定向 retry 指令。 + +**Tech Stack:** Python 3.10+, LangChain LLM, pytest + +--- + +## 文件改动清单 + +| 文件 | 操作 | 说明 | +|---|---|---| +| `src/config/prompts/judge/ask_judge.txt` | 重写 | Gate + rubric 结构,动态路由段注入 | +| `src/config/prompts/judge/acquire_judge.txt` | 重写 | Gate + rubric 结构,keywords 评分迁入 | +| `src/config/prompts/judge/appraise_judge.txt` | 重写 | Gate + rubric 结构,新增升级因素审计 | +| `src/config/prompts/judge/apply_judge.txt` | 重写 | Gate + rubric 结构,route_dimension 审计 | +| `src/judge/judge_llm.py` | 修改 | `_score_*` 函数全部重写;新增 `_check_gates`;`STAGE_WEIGHTS` 替换为 rubric 权重表;Appraise 新增 Layer 1 Python 校验 | +| `tests/test_judge_rubrics.py` | 新建 | 各阶段 rubric 评分单元测试 | + +--- + +## Task 1: 重写 `ask_judge.txt` + +**Files:** +- Modify: `src/config/prompts/judge/ask_judge.txt` + +- [ ] **Step 1: 写入新 prompt** + +完整替换 `ask_judge.txt` 内容为以下内容(注意 JSON 输出示例中的双花括号是 Python format 转义,实际文件写单花括号): + +``` +# Role +你是一个严格的EBM审计员。对 Ask Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 + +# Input +原始问题:{original_question} +路由类型:{route_type} +Ask Agent 输出:{stage_output} + +# 一票否决项(Gate) +以下任一项为 NO 时,整体判定为 gate_fail,无需继续评分。 + +## G1. intent_not_distorted +结构化结果是否忠实反映原问题意图(方向性:人群、问题类型)? +- YES:意图一致 +- NO:方向性错误(问儿童→写成人;问治疗→写诊断) + +## G2. route_correct(仅当 route_type != direct_answer 时判断,否则填 NA) +route_type 与问题类型是否匹配? +- YES:匹配 +- NO:明显错误(如诊断准确性问题路由为 ebm_pico) +- NA:route_type = direct_answer,不适用 + +## G3. nonresearch_classification_correct(仅当 route_type = direct_answer 时判断,否则填 NA) +以下三条触发条件是否全部满足? +1. 问题要求立即操作性指导(动词:如何处理/立即给/紧急处置) +2. 延迟回答会直接危及患者生命安全 +3. 答案来自已有公认标准流程(BLS/ACLS/指南操作章节) +- YES:三条均满足 +- NO:任一条不满足(应重路由到 EBM 流程) +- NA:route_type != direct_answer,不适用 + +# Rubric 评分项(仅适用于 EBM 路由;direct_answer 路由时所有 rubric 填 NA) + +## R1. 
core_dimensions_present【Critical,权重3】 +P + 主焦点维度(ebm_pico→I;ebm_pird→IndexTest;ebm_peo→Exposure;ebm_prognosis→PF)+ O 是否均有实质内容? +- YES:三个核心维度均有实质内容 +- PARTIAL:三者中有一个描述极度模糊(如 O="outcomes")但方向正确 +- NO:任一核心维度完全缺失或填写错误 + +## R2. secondary_dimensions_present【Major,权重2】 +次要维度(ebm_pico→C;ebm_pird→R;ebm_prognosis→TH;ebm_peo 无次要维度填 NA)是否按路由要求填写?原问题未涉及的填 NA。 +- YES:次要维度填写正确,或原问题未涉及时正确填 NA +- PARTIAL:次要维度有轻微偏差但不影响检索方向 +- NO:次要维度明显错误(如 PIRD 的 R 字段填了干预措施) +- NA:ebm_peo 路由(无次要维度) + +## R3. statement_unambiguous【Minor,权重1】 +结构化表述是否无歧义,可直接用于检索? +- YES:表述明确,无歧义 +- PARTIAL:有轻微歧义但不影响检索方向 +- NO:严重歧义,检索方向不确定 + +# Output Format +仅输出以下 JSON,不要包含任何其他文本: + +{ + "gate_results": { + "intent_not_distorted": "YES | NO", + "route_correct": "YES | NO | NA", + "nonresearch_classification_correct": "YES | NO | NA" + }, + "rubric_results": { + "core_dimensions_present": "YES | PARTIAL | NO | NA", + "secondary_dimensions_present": "YES | PARTIAL | NO | NA", + "statement_unambiguous": "YES | PARTIAL | NO | NA" + }, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" +} +``` + +- [ ] **Step 2: 验证格式** + +```bash +python3 -c " +from pathlib import Path +txt = Path('src/config/prompts/judge/ask_judge.txt').read_text() +assert '{original_question}' in txt +assert '{route_type}' in txt +assert '{stage_output}' in txt +assert 'gate_results' in txt +assert 'rubric_results' in txt +print('ask_judge.txt OK') +" +``` + +--- + +## Task 2: 重写 `acquire_judge.txt` + +**Files:** +- Modify: `src/config/prompts/judge/acquire_judge.txt` + +- [ ] **Step 1: 写入新 prompt** + +完整替换 `acquire_judge.txt` 内容为: + +``` +# Role +你是一个严格的EBM审计员。对 Acquire Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 + +# 核心EBM原则 +证据质量 ≠ 证据数量。1篇Cochrane系统评价 > 10篇RCT > 100篇病例报告。 + +# Input +路由类型:{route_type} +结构化查询:{ebm_query} +Acquire Agent 输出(已排序的证据列表):{stage_output} + +# 预处理:系统错误检测 +首先检查输入数据中是否包含 error 字段(如 "error": "Connection timeout"): +如果存在 error 字段,说明 PubMed API 调用本身失败,与检索词无关。 +此时跳过所有审计项,直接输出:search_terms_valid=YES,所有 
rubric 填 NA,search_exhausted=false,failures=[],overall_quality=pass。 + +# 一票否决项(Gate) + +## G1. search_terms_valid +检索词方向是否正确,能对应到查询的核心概念? +- YES:检索词方向正确 +- NO:检索词方向完全错误(如问心衰治疗却检索肾功能指标) + +# Rubric 评分项 + +各 route_type 对应的主焦点维度: +- ebm_pico:Intervention +- ebm_pird:Index Test +- ebm_peo:Exposure +- ebm_prognosis:Prognostic Factor + +## R1. keywords_cover_pico_dimensions【Critical,权重3】 +关键词是否覆盖 P + 主焦点维度,且至少含一个可在 MeSH 验证的标准词? +- YES:覆盖 P + 主焦点维度,且含 MeSH 标准词 +- PARTIAL:覆盖了 P 或主焦点之一,但另一维度无对应关键词;或有覆盖但无 MeSH 标准词 +- NO:关键词全部指向同一概念,未覆盖多个维度 + +## R2. primary_focus_match【Critical,权重3】 +基于证据列表中主焦点匹配度最好的那篇证据判断:证据中的核心干预/暴露/测试是否与查询主焦点维度匹配? +- YES:精准匹配 +- PARTIAL:同类方法但有差异(不同剂量/版本),相关性高 +- NO:完全不同的测试/干预/暴露 + +## R3. outcome_match【Critical,权重3】 +基于证据列表中结局匹配度最好的那篇证据判断:证据是否报告了临床关心的结局指标? +- YES:报告了直接结局指标 +- PARTIAL:报告了代理指标或部分相关结局 +- NO:未报告任何相关结局 + +## R4. keywords_have_synonyms【Major,权重2】 +核心概念是否有同义词/变体(如 SGLT2i + empagliflozin + dapagliflozin)? +- YES:有同义词/变体 +- PARTIAL:有部分同义词但不完整 +- NO:无任何同义词扩展,仅有单一术语 + +## R5. keywords_count_sufficient【Major,权重2】 +关键词数量是否充足? +- YES:≥ 5 个 +- PARTIAL:3-4 个 +- NO:≤ 2 个 + +## R6. study_design_matches_route【Major,权重2】 +纳入文献的研究设计是否与 route_type 的优先级匹配? +匹配表: +- ebm_pico:第一优先级=SR/Meta分析(基于RCT),第二=RCT,第三=观察性研究,排除=机制综述/专家意见/病例报告 +- ebm_pird:第一优先级=SR/Meta分析(基于诊断准确性研究),第二=诊断准确性研究(横断面),第三=回顾性诊断研究,排除=机制综述/治疗类RCT +- ebm_peo:第一优先级=SR/Meta分析(基于观察性研究),第二=前瞻性队列,第三=病例对照,排除=RCT/机制综述 +- ebm_prognosis:第一优先级=SR/Meta分析(基于队列研究),第二=前瞻性队列,第三=回顾性队列,排除=机制综述/病例报告 +- YES:有第一优先级文献 +- PARTIAL:有次优先级文献但无第一优先级,或混入少量不匹配设计 +- NO:大量纳入与 route_type 不匹配的研究设计 + +## R7. population_match【Major,权重2】 +基于证据列表中人群匹配度最好的那篇证据判断:研究人群是否与查询 Patient 匹配? +- YES:精准匹配(相同年龄段、相同疾病状态) +- PARTIAL:有轻微差异,结论可审慎外推 +- NO:严重不匹配(成人证据用于儿科;完全不同疾病) + +## R8. top_selection_appropriate【Minor,权重1】 +排名靠前的文献(排名第1-3位)是否确实是列表中最优的证据选择? +- YES:排名前列的文献研究层级高且匹配度好 +- PARTIAL:总体合理,但有个别文献位置不最优 +- NO:排名顺序明显不合理(如病例报告排在SR/RCT前面) + +## R9. selection_count_appropriate【Minor,权重1】 +选取数量是否合理? 
+- YES:数量与候选质量相符 +- PARTIAL:数量略多或略少,但整体可接受 +- NO:明显不合理(大量高质量候选却只选1-2篇,或质量极差仍凑满10篇) + +## R10. key_sentences_present【Minor,权重1】 +Top 文章的 key_sentences 字段是否有实质内容? +- YES:Top 文章的 key_sentences 非空,RAG 流程正常执行 +- PARTIAL:部分文章 key_sentences 为空(摘要极短导致 chunk 失败) +- NO:所有文章 key_sentences 均为空,RAG 流程可能失败 + +# Output Format +仅输出以下 JSON,不要包含任何其他文本: + +{ + "gate_results": { + "search_terms_valid": "YES | NO" + }, + "rubric_results": { + "keywords_cover_pico_dimensions": "YES | PARTIAL | NO", + "primary_focus_match": "YES | PARTIAL | NO", + "outcome_match": "YES | PARTIAL | NO", + "keywords_have_synonyms": "YES | PARTIAL | NO", + "keywords_count_sufficient": "YES | PARTIAL | NO", + "study_design_matches_route": "YES | PARTIAL | NO", + "population_match": "YES | PARTIAL | NO", + "top_selection_appropriate": "YES | PARTIAL | NO", + "selection_count_appropriate": "YES | PARTIAL | NO", + "key_sentences_present": "YES | PARTIAL | NO" + }, + "search_exhausted": false, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" +} +``` + +- [ ] **Step 2: 验证格式** + +```bash +python3 -c " +from pathlib import Path +txt = Path('src/config/prompts/judge/acquire_judge.txt').read_text() +assert '{route_type}' in txt +assert '{ebm_query}' in txt +assert '{stage_output}' in txt +assert 'gate_results' in txt +assert 'rubric_results' in txt +assert 'search_exhausted' in txt +print('acquire_judge.txt OK') +" +``` + +--- + +## Task 3: 重写 `appraise_judge.txt` + +**Files:** +- Modify: `src/config/prompts/judge/appraise_judge.txt` + +- [ ] **Step 1: 写入新 prompt** + +完整替换 `appraise_judge.txt` 内容为: + +``` +# Role +你是一个严格的EBM审计员。对 Appraise Agent 的GRADE评价进行客观分类判断,只输出结构化 JSON,不要打分。 + +# 背景说明 +Appraise Agent 输出结构化的GRADE分类标签(study_type、risk_of_bias等),最终GRADE等级由系统代码根据这些标签自动计算。你的审计重点是: +1. LLM对研究类型(study_type)的识别是否正确 +2. 各降级/升级因素的分类是否合理 +3. 系统计算出的GRADE等级(computed_grade)是否与你的独立判断一致 + +# Input +证据列表:{evidence_list} +Appraise Agent 输出(包含分类标签和计算结果):{stage_output} + +# 一票否决项(Gate) + +## G1. 
study_type_correct +所有研究的 study_type 识别是否正确? +- YES:所有研究的 study_type 识别正确 +- NO:存在明显错误(如将观察性研究标记为RCT) + +## G2. computed_grade_reasonable +系统计算出的最终GRADE等级(computed_grade)是否合理? +- YES:计算结果与基于摘要的独立判断一致 +- NO:明显不合理(通常是 study_type 或降级因素错误导致) + +注意:以下情况属于合理结果,不应判断为 NO: +- SR/MA 纳入观察性研究(included_study_type=OBSERVATIONAL)→ 初始分为 Low,即使无降级因素也可能输出 Low/Very Low +- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时,即使 large_effect=YES 也不升级 +- CROSS_SECTIONAL 无升级因素 → 最高只能到 Low + +# Rubric 评分项 + +## R1. downgrade_factors_appropriate【Critical,权重3】 +四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注是否与摘要信息相符? +- YES:各因素的严重程度标签(NOT_SERIOUS/SERIOUS/VERY_SERIOUS)与摘要信息相符 +- PARTIAL:整体合理,但个别因素评估过于宽松或严苛 +- NO:存在明显错误(如未盲法 RCT 标记为 NOT_SERIOUS 偏倚风险) + +## R2. included_study_type_correct【Critical,权重3】 +(仅当证据列表含 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时判断,否则填 NA) +SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类型相符? +- YES:字段与摘要描述的纳入研究类型相符(如摘要明确描述"纳入RCT"→ RCT) +- PARTIAL:摘要信息不足以确认(如摘要未描述纳入类型 → UNKNOWN 是合理选择) +- NO:明显错误(如摘要写"仅纳入RCT"但标注为 OBSERVATIONAL) +- NA:证据列表中没有 SR/MA/NMA 类型研究 + +## R3. upgrade_factors_appropriate【Major,权重2】 +(仅当证据列表含 COHORT/CASE_CONTROL 时判断,否则填 NA) +升级因素(large_effect/dose_response/confounding_bias_mitigates)的标注是否与摘要信息相符? +- YES:升级因素的 YES/NO 标注与摘要信息相符 +- PARTIAL:整体合理,个别因素有轻微偏差 +- NO:明显错误(如无明确剂量效应数据但标注 dose_response=YES) +- NA:证据列表中没有 COHORT/CASE_CONTROL 研究 + +## R4. upgrade_blocked_appropriate【Major,权重2】 +(仅当含 COHORT/CASE_CONTROL 且 risk_of_bias=SERIOUS/VERY_SERIOUS 时判断,否则填 NA) +存在严重偏倚风险时,升级因素是否被正确阻断(upgrade_blocked_by_bias=True)? +- YES:risk_of_bias=SERIOUS/VERY_SERIOUS 时,upgrade_blocked_by_bias 正确标注为 True,且最终等级未因升级因素提升 +- NO:存在严重偏倚但升级因素仍被计入 +- NA:无 COHORT/CASE_CONTROL 研究,或 risk_of_bias 均为 NOT_SERIOUS + +## R5. conflicts_identified【Major,权重2】 +证据间存在实质性冲突时,冲突是否被正确识别并描述? +- YES:所有主要冲突均被识别,conflict_description 描述准确;或证据间无冲突(正确标记为无冲突) +- PARTIAL:识别了主要冲突,但有遗漏或描述不够深入 +- NO:存在明显冲突但完全未识别 + +## R6. numerical_data_extracted【Minor,权重1】 +摘要中存在效应量/CI/P值时,是否均被提取? 
+- YES:data_available 的判断准确,能识别摘要中存在的数值指标 +- PARTIAL:判断基本合理,有轻微偏差 +- NO:摘要有明确效应量但标记为未提取 + +# Output Format +仅输出以下 JSON,不要包含任何其他文本: + +{ + "gate_results": { + "study_type_correct": "YES | NO", + "computed_grade_reasonable": "YES | NO" + }, + "rubric_results": { + "downgrade_factors_appropriate": "YES | PARTIAL | NO", + "included_study_type_correct": "YES | PARTIAL | NO | NA", + "upgrade_factors_appropriate": "YES | PARTIAL | NO | NA", + "upgrade_blocked_appropriate": "YES | NO | NA", + "conflicts_identified": "YES | PARTIAL | NO", + "numerical_data_extracted": "YES | PARTIAL | NO" + }, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" +} +``` + +- [ ] **Step 2: 验证格式** + +```bash +python3 -c " +from pathlib import Path +txt = Path('src/config/prompts/judge/appraise_judge.txt').read_text() +assert '{evidence_list}' in txt +assert '{stage_output}' in txt +assert 'gate_results' in txt +assert 'rubric_results' in txt +assert 'upgrade_blocked_appropriate' in txt +print('appraise_judge.txt OK') +" +``` + +--- + +## Task 4: 重写 `apply_judge.txt` + +**Files:** +- Modify: `src/config/prompts/judge/apply_judge.txt` + +- [ ] **Step 1: 写入新 prompt** + +完整替换 `apply_judge.txt` 内容为: + +``` +# Role +你是一个严格的EBM审计员。对 Apply Agent 生成的临床推荐进行客观分类判断,只输出结构化 JSON,不要打分。 + +# Input +路由类型:{route_type} +结构化查询:{query_description} +证据评价结果(来自Appraise阶段):{appraisal_results} +Apply Agent 输出(临床推荐):{stage_output} + +# 一票否决项(Gate) + +## G1. recommendation_grounded_in_evidence +推荐意见是否基于本次检索的证据,方向与证据一致? +- YES:推荐完全来源于提供的证据,方向一致 +- NO:推荐与证据无关或方向相反 + +## G2. route_dimension_consistent +Apply 的维度一致性检查是否使用了与 route_type 匹配的框架? 
+各 route_type 对应的正确框架: +- ebm_pico:Population / Intervention / Comparator / Outcome +- ebm_pird:Population / Index Test / Reference Standard / Target Condition +- ebm_peo:Population / Exposure / Outcome(无 Comparator) +- ebm_prognosis:Population / Prognostic Factor / Outcome / Time Horizon +- YES:维度框架与 route_type 匹配 +- NO:使用了错误框架(如 PIRD 问题用 PICO 框架,Index Test 被映射为 Intervention) + +## G3. strength_not_grossly_inflated +推荐强度是否未严重超出证据上限? +- YES:推荐强度在证据支持范围内 +- NO:Very Low 或 Low 证据给出 Strong 推荐,或有充分高质量证据却输出 No Recommendation + +# Rubric 评分项 + +## R1. effect_size_correctly_reported【Critical,权重3】 +效应量、置信区间、GRADE 等级是否被正确转述,无数据失真? +- YES:数值被正确转述,无失真 +- PARTIAL:数值基本正确,有轻微表述偏差但不影响结论方向 +- NO:效应量或 GRADE 等级被错误转述,导致结论方向改变 + +## R2. strength_matches_evidence【Critical,权重3】 +推荐强度是否与证据等级严格匹配? +注意:inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐属正确行为,不应标注为不匹配。 +EBM原则:Strong需要High/Moderate直接证据;Weak适用于Low质量或结果不一致;Conditional适用于仅有间接证据;Consensus-based适用于仅有专家共识/指南。 +- YES:推荐强度与证据等级严格匹配(含上述特殊情况) +- PARTIAL:有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 +- NO:推荐强度与证据等级明显不符(不触发 gate 的中等程度不匹配) + +## R3. population_applicability_addressed【Major,权重2】 +是否明确说明了证据人群与当前患者的匹配程度,包括可外推性或外推限制? +- YES:明确说明了人群匹配程度和外推性 +- PARTIAL:有提及人群差异但说明不充分 +- NO:完全未讨论人群适配性 + +## R4. uncertainty_source_explained【Major,权重2】 +不确定性的来源是否被明确说明(如样本量不足、间接证据、研究设计局限)? +- YES:不确定性来源被明确说明 +- PARTIAL:提及了不确定性但未说明来源 +- NO:未提及不确定性,或仅说"证据有限"而无来源说明 + +## R5. citation_traceable【Major,权重2】 +推荐依据是否有文献溯源(PMID 或标题可追溯)? +- YES:推荐依据有文献溯源 +- PARTIAL:部分推荐有溯源,部分缺失 +- NO:无任何文献溯源 + +## R6. recommendation_specific【Minor,权重1】 +推荐内容是否足够具体,临床医生可据此执行(含适应症、关键参数等)? +- YES:推荐包含关键细节,临床医生可直接执行 +- PARTIAL:推荐方向明确但缺少关键细节 +- NO:推荐过于模糊,无法指导临床决策 + +## R7. patient_preference_considered【Minor,权重1】 +患者偏好或价值观是否被纳入推荐表述(或明确说明不适用)? 
+- YES:患者偏好被纳入,或明确说明不适用 +- PARTIAL:有提及但表述笼统 +- NO:完全未提及患者偏好 + +# Output Format +仅输出以下 JSON,不要包含任何其他文本: + +{ + "gate_results": { + "recommendation_grounded_in_evidence": "YES | NO", + "route_dimension_consistent": "YES | NO", + "strength_not_grossly_inflated": "YES | NO" + }, + "rubric_results": { + "effect_size_correctly_reported": "YES | PARTIAL | NO", + "strength_matches_evidence": "YES | PARTIAL | NO", + "population_applicability_addressed": "YES | PARTIAL | NO", + "uncertainty_source_explained": "YES | PARTIAL | NO", + "citation_traceable": "YES | PARTIAL | NO", + "recommendation_specific": "YES | PARTIAL | NO", + "patient_preference_considered": "YES | PARTIAL | NO" + }, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" +} +``` + +- [ ] **Step 2: 验证格式** + +```bash +python3 -c " +from pathlib import Path +txt = Path('src/config/prompts/judge/apply_judge.txt').read_text() +assert '{route_type}' in txt +assert '{query_description}' in txt +assert '{appraisal_results}' in txt +assert '{stage_output}' in txt +assert 'gate_results' in txt +assert 'rubric_results' in txt +assert 'route_dimension_consistent' in txt +print('apply_judge.txt OK') +" +``` + +--- + +## Task 5: 重写 `judge_llm.py` — 核心评分架构 + +**Files:** +- Modify: `src/judge/judge_llm.py` + +- [ ] **Step 1: 替换 STAGE_WEIGHTS 为 rubric 权重表** + +在 `judge_llm.py` 顶部,将 `STAGE_WEIGHTS` 替换为: + +```python +# Rubric weight definitions per stage +# Each rubric: (weight, allows_partial) +# Gate items are not listed here — they are checked separately in _check_gates() +RUBRIC_WEIGHTS = { + "Ask": { + "core_dimensions_present": (3, True), # Critical + "secondary_dimensions_present": (2, True), # Major + "statement_unambiguous": (1, True), # Minor + }, + "Acquire": { + "keywords_cover_pico_dimensions": (3, True), + "primary_focus_match": (3, True), + "outcome_match": (3, True), + "keywords_have_synonyms": (2, True), + "keywords_count_sufficient": (2, True), + "study_design_matches_route": 
(2, True), + "population_match": (2, True), + "top_selection_appropriate": (1, True), + "selection_count_appropriate": (1, True), + "key_sentences_present": (1, True), + }, + "Appraise": { + "downgrade_factors_appropriate": (3, True), + "included_study_type_correct": (3, True), + "upgrade_factors_appropriate": (2, True), + "upgrade_blocked_appropriate": (2, False), # only YES/NO/NA + "conflicts_identified": (2, True), + "numerical_data_extracted": (1, True), + }, + "Apply": { + "effect_size_correctly_reported": (3, True), + "strength_matches_evidence": (3, True), + "population_applicability_addressed":(2, True), + "uncertainty_source_explained": (2, True), + "citation_traceable": (2, True), + "recommendation_specific": (1, True), + "patient_preference_considered": (1, True), + }, +} + +PASS_THRESHOLD = 0.7 +``` + +- [ ] **Step 2: 新增 `_check_gates` 函数** + +在 `RUBRIC_WEIGHTS` 定义后添加: + +```python +def _check_gates(stage: str, audit: dict) -> list: + """ + Check gate items for a stage. Returns list of failed gate names. + Any gate failure means overall fail regardless of rubric scores. 
+ """ + gate_results = audit.get("gate_results", {}) + failed = [] + + if stage == "Ask": + if gate_results.get("intent_not_distorted") == "NO": + failed.append("intent_not_distorted") + if gate_results.get("route_correct") == "NO": + failed.append("route_correct") + if gate_results.get("nonresearch_classification_correct") == "NO": + failed.append("nonresearch_classification_correct") + + elif stage == "Acquire": + if gate_results.get("search_terms_valid") == "NO": + failed.append("search_terms_valid") + + elif stage == "Appraise": + if gate_results.get("study_type_correct") == "NO": + failed.append("study_type_correct") + if gate_results.get("computed_grade_reasonable") == "NO": + failed.append("computed_grade_reasonable") + + elif stage == "Apply": + if gate_results.get("recommendation_grounded_in_evidence") == "NO": + failed.append("recommendation_grounded_in_evidence") + if gate_results.get("route_dimension_consistent") == "NO": + failed.append("route_dimension_consistent") + if gate_results.get("strength_not_grossly_inflated") == "NO": + failed.append("strength_not_grossly_inflated") + + return failed +``` + +- [ ] **Step 3: 新增 `_score_rubrics` 函数** + +```python +def _score_rubrics(stage: str, audit: dict) -> tuple: + """ + Score rubric items using weighted rubric system. + Returns (dimension_scores, raw_issues, total_score). + NA items are excluded from denominator. + YES = full weight, PARTIAL = weight * 0.5, NO = 0. 
+ """ + rubric_weights = RUBRIC_WEIGHTS.get(stage, {}) + rubric_results = audit.get("rubric_results", {}) + issues = [] + total_score = 0.0 + total_max = 0.0 + dimension_scores = {} + + for rubric_name, (weight, allows_partial) in rubric_weights.items(): + val = rubric_results.get(rubric_name, "NA") + if val == "NA": + dimension_scores[rubric_name] = None # excluded + continue + + if val == "YES": + score = float(weight) + elif val == "PARTIAL" and allows_partial: + score = weight * 0.5 + else: # NO or PARTIAL on non-partial rubric + score = 0.0 + + total_score += score + total_max += weight + dimension_scores[rubric_name] = score / weight # normalize to 0-1 for display + + if val == "NO": + severity = "critical" if weight == 3 else "major" if weight == 2 else "minor" + issues.append({ + "severity": severity, + "dimension": rubric_name, + "description": f"{rubric_name} 未通过(NO)", + }) + elif val == "PARTIAL": + severity = "major" if weight >= 2 else "minor" + issues.append({ + "severity": severity, + "dimension": rubric_name, + "description": f"{rubric_name} 部分通过(PARTIAL)", + }) + + overall = total_score / total_max if total_max > 0 else 1.0 + return dimension_scores, issues, overall +``` + +- [ ] **Step 4: 重写 `_score_ask`** + +```python +def _score_ask(audit: dict) -> tuple: + gate_failures = _check_gates("Ask", audit) + if gate_failures: + issues = [{"severity": "critical", "dimension": g, + "description": f"Gate 失败: {g}"} for g in gate_failures] + return {"core_dimensions_present": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" + + # direct_answer: gate passed means classification correct → terminate signal + gate_results = audit.get("gate_results", {}) + if gate_results.get("nonresearch_classification_correct") == "YES": + return {"nonresearch": 1.0}, [], False, "direct_answer路由正确,触发terminate" + + dim_scores, issues, overall = _score_rubrics("Ask", audit) + pass_threshold = overall >= PASS_THRESHOLD + failures = audit.get("failures", []) + return 
dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" +``` + +- [ ] **Step 5: 重写 `_score_acquire`** + +```python +def _score_acquire(audit: dict) -> tuple: + search_exhausted = bool(audit.get("search_exhausted", False)) + if search_exhausted: + return {"search_exhausted": 1.0}, [], True, "检索穷尽,标记evidence_gap" + + gate_failures = _check_gates("Acquire", audit) + if gate_failures: + issues = [{"severity": "critical", "dimension": g, + "description": f"Gate 失败: {g}"} for g in gate_failures] + return {"search_terms_valid": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" + + dim_scores, issues, overall = _score_rubrics("Acquire", audit) + failures = audit.get("failures", []) + return dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" +``` + +- [ ] **Step 6: 重写 `_score_appraise`** + +```python +def _score_appraise(audit: dict) -> tuple: + gate_failures = _check_gates("Appraise", audit) + if gate_failures: + issues = [{"severity": "critical", "dimension": g, + "description": f"Gate 失败: {g}"} for g in gate_failures] + return {"study_type_correct": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" + + dim_scores, issues, overall = _score_rubrics("Appraise", audit) + failures = audit.get("failures", []) + return dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" +``` + +- [ ] **Step 7: 重写 `_score_apply`** + +```python +def _score_apply(audit: dict) -> tuple: + gate_failures = _check_gates("Apply", audit) + if gate_failures: + issues = [{"severity": "critical", "dimension": g, + "description": f"Gate 失败: {g}"} for g in gate_failures] + return {"recommendation_grounded_in_evidence": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" + + dim_scores, issues, overall = _score_rubrics("Apply", audit) + failures = audit.get("failures", []) + return dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" +``` + +- [ ] 
**Step 8: 更新 `_calculate_overall_score` 以兼容新 rubric 体系** + +新的 `_score_*` 函数仍只返回 `(dim_scores, issues, search_exhausted, reasoning_hint)`,并不返回 overall score(综合评分仅写入 reasoning_hint);`_calculate_overall_score` 只在 Assess 阶段(未改动)使用。在 `evaluate_stage` 中,对 Ask/Acquire/Appraise/Apply 阶段,overall_score 从 `_score_rubrics` 直接取得,不再走 `STAGE_WEIGHTS` 加权。 + +在 `evaluate_stage` 中修改评分计算段: + +```python +dimension_scores, raw_issues, search_exhausted, reasoning_hint = scorer(audit) + +# For rubric-based stages, compute overall from rubric scores directly +if stage in ("Ask", "Acquire", "Appraise", "Apply"): + gate_failures = _check_gates(stage, audit) + if gate_failures: + overall_score = 0.0 + else: + _, _, overall_score = _score_rubrics(stage, audit) + # Defensive clamp to [0, 1]; an all-NA rubric set already yields 1.0 from _score_rubrics + overall_score = max(0.0, min(1.0, overall_score)) +else: + overall_score = self._calculate_overall_score(stage, dimension_scores) +``` + +- [ ] **Step 9: 更新 `_prepare_context` 中 Acquire 和 Apply 的字段注入** + +Acquire 阶段:将 `pico_query` 替换为 `ebm_query` + `route_type`: + +```python +elif stage == "Acquire": + ebm_query = state.get("ebm_query") + pico = state.get("pico_query") + if ebm_query: + context["route_type"] = state.get("route_type", "ebm_pico") + context["ebm_query"] = json.dumps({ + "patient": ebm_query.patient, + "primary_focus": ebm_query.primary_focus, + "comparator": getattr(ebm_query, "comparator", None), + "outcome": ebm_query.outcome, + "keywords": ebm_query.keywords, + }, ensure_ascii=False, indent=2) + elif pico: + context["route_type"] = "ebm_pico" + context["ebm_query"] = json.dumps({ + "patient": pico.patient, + "primary_focus": pico.intervention, + "comparator": pico.comparison, + "outcome": pico.outcome, + "keywords": pico.keywords, + }, ensure_ascii=False, indent=2) + else: + context["route_type"] = "ebm_pico" + context["ebm_query"] = "N/A" +``` + +Apply 阶段:将 `pico_query` 替换为 `route_type` + `query_description`: + +```python +elif stage == "Apply": + context["route_type"] = state.get("route_type", "ebm_pico") + ebm_query = state.get("ebm_query") + pico = 
state.get("pico_query") + if ebm_query: + context["query_description"] = json.dumps({ + "patient": ebm_query.patient, + "primary_focus": ebm_query.primary_focus, + "outcome": ebm_query.outcome, + }, ensure_ascii=False, indent=2) + elif pico: + context["query_description"] = json.dumps({ + "patient": pico.patient, + "intervention": pico.intervention, + "comparison": pico.comparison, + "outcome": pico.outcome, + }, ensure_ascii=False, indent=2) + else: + context["query_description"] = "N/A" + # appraisal_results 注入保持不变 + appraisal = state.get("appraisal_results") + if appraisal: + context["appraisal_results"] = json.dumps({ + "evidence_count": len(appraisal.evidence), + "has_conflict": appraisal.has_conflict, + "summary": appraisal.summary, + }, ensure_ascii=False, indent=2) + else: + context["appraisal_results"] = "N/A" +``` + +Ask 阶段:新增 `route_type` 注入: + +```python +if stage == "Ask": + context["original_question"] = state["original_question"] + context["route_type"] = state.get("route_type", "unknown") +``` + +- [ ] **Step 10: 运行 lint 检查** + +```bash +python3 -m ruff check src/judge/judge_llm.py +``` + +Expected: no errors (or only pre-existing warnings unrelated to this change). + +--- + +## Task 6: 新增 Appraise Layer 1 Python 校验 + +**Files:** +- Modify: `src/judge/judge_llm.py` + +- [ ] **Step 1: 新增 `_appraise_layer1_check` 函数** + +在 `judge_llm.py` 中添加: + +```python +def _appraise_layer1_check(output: dict) -> dict: + """ + Layer 1 Python hardcoded validation for Appraise stage. + Returns dict with keys: passed (bool), failures (list[str]). + If passed=True, skip LLM Judge entirely. + Raises SystemError if grade_output_in_legal_range fails. 
+ """ + LEGAL_GRADES = {"High", "Moderate", "Low", "Very Low"} + failures = [] + + appraisal = output.get("appraisal_results") + if appraisal is None: + failures.append("appraisal_results missing") + return {"passed": False, "failures": failures} + + from dataclasses import asdict, is_dataclass + appraisal_d = asdict(appraisal) if is_dataclass(appraisal) else appraisal + evidence_list = appraisal_d.get("evidence", []) + + LEGAL_STUDY_TYPES = { + "RCT", "COHORT", "CASE_CONTROL", "CASE_REPORT", + "SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA", + "GUIDELINE", "CROSS_SECTIONAL", "NARRATIVE_REVIEW", "EXPERT_OPINION", + } + + for ev in evidence_list: + study_type = ev.get("study_type") + if not study_type or study_type not in LEGAL_STUDY_TYPES: + failures.append(f"study_type missing or illegal: pmid={ev.get('pmid','?')} study_type={study_type}") + + rob = ev.get("risk_of_bias") + if rob is None: + failures.append(f"risk_of_bias missing: pmid={ev.get('pmid','?')}") + + grade = ev.get("grade_level") + if grade and grade not in LEGAL_GRADES: + raise SystemError( + f"grade_output_in_legal_range FAILED: pmid={ev.get('pmid','?')} grade={grade}. " + "Illegal grade value — workflow terminated." 
+ ) + + return {"passed": len(failures) == 0, "failures": failures} +``` + +- [ ] **Step 2: 在 `evaluate_stage` 中为 Appraise 阶段插入 Layer 1 前置检查** + +在 `evaluate_stage` 方法中,在 `prompt_template = self._load_prompt(stage)` 之前插入: + +```python +# Appraise Layer 1: Python hardcoded check before calling LLM Judge +if stage == "Appraise": + layer1 = _appraise_layer1_check(output) + if layer1["passed"]: + # All structural checks pass — skip LLM Judge and + # return a passing evaluation directly (no issues to build) + evaluation = Evaluation( + overall_score=1.0, + dimension_scores={"layer1_structural": 1.0}, + pass_threshold=True, + issues=[], + summary="Layer 1 结构校验通过,跳过 LLM Judge", + search_exhausted=False, + ) + return ObserveSchema(stage=stage, output=output, evaluation=evaluation) + else: + print(f"[Appraise Layer1] 校验失败,触发 LLM Judge: {layer1['failures']}") +``` + +- [ ] **Step 3: 运行 lint 检查** + +```bash +python3 -m ruff check src/judge/judge_llm.py +``` + +--- + +## Task 7: 编写单元测试 + +**Files:** +- Create: `tests/test_judge_rubrics.py` + +- [ ] **Step 1: 创建测试文件** + +```python +"""Unit tests for Gate + Weighted Rubrics judge scoring.""" +import pytest +from src.judge.judge_llm import ( + _check_gates, + _score_rubrics, + _score_ask, + _score_acquire, + _score_appraise, + _score_apply, + _appraise_layer1_check, + RUBRIC_WEIGHTS, + PASS_THRESHOLD, +) + + +# ── _check_gates ───────────────────────────────────────────────────────────── + +def test_check_gates_ask_all_pass(): + audit = {"gate_results": { + "intent_not_distorted": "YES", + "route_correct": "YES", + "nonresearch_classification_correct": "NA", + }} + assert _check_gates("Ask", audit) == [] + + +def test_check_gates_ask_intent_fail(): + audit = {"gate_results": {"intent_not_distorted": "NO", "route_correct": "YES"}} + assert "intent_not_distorted" in _check_gates("Ask", audit) + + +def test_check_gates_ask_route_fail(): + audit = {"gate_results": {"intent_not_distorted": "YES", "route_correct": "NO"}} + 
assert "route_correct" in _check_gates("Ask", audit) + + +def test_check_gates_acquire_pass(): + audit = {"gate_results": {"search_terms_valid": "YES"}} + assert _check_gates("Acquire", audit) == [] + + +def test_check_gates_acquire_fail(): + audit = {"gate_results": {"search_terms_valid": "NO"}} + assert "search_terms_valid" in _check_gates("Acquire", audit) + + +def test_check_gates_appraise_study_type_fail(): + audit = {"gate_results": {"study_type_correct": "NO", "computed_grade_reasonable": "YES"}} + assert "study_type_correct" in _check_gates("Appraise", audit) + + +def test_check_gates_apply_all_fail(): + audit = {"gate_results": { + "recommendation_grounded_in_evidence": "NO", + "route_dimension_consistent": "NO", + "strength_not_grossly_inflated": "YES", + }} + failures = _check_gates("Apply", audit) + assert "recommendation_grounded_in_evidence" in failures + assert "route_dimension_consistent" in failures + assert "strength_not_grossly_inflated" not in failures + + +# ── _score_rubrics ──────────────────────────────────────────────────────────── + +def test_score_rubrics_ask_all_yes(): + audit = { + "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, + "rubric_results": { + "core_dimensions_present": "YES", + "secondary_dimensions_present": "YES", + "statement_unambiguous": "YES", + } + } + dim_scores, issues, overall = _score_rubrics("Ask", audit) + assert overall == pytest.approx(1.0) + assert issues == [] + + +def test_score_rubrics_ask_partial_critical(): + # core_dimensions_present=PARTIAL → 1.5/3, others YES + audit = { + "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, + "rubric_results": { + "core_dimensions_present": "PARTIAL", + "secondary_dimensions_present": "YES", + "statement_unambiguous": "YES", + } + } + dim_scores, issues, overall = _score_rubrics("Ask", audit) + # total_score = 1.5 + 2 + 1 = 4.5, total_max = 3+2+1 = 6 + assert overall == pytest.approx(4.5 / 6) + assert any(i["severity"] == 
"major" for i in issues) + + +def test_score_rubrics_ask_no_critical(): + audit = { + "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, + "rubric_results": { + "core_dimensions_present": "NO", + "secondary_dimensions_present": "YES", + "statement_unambiguous": "YES", + } + } + dim_scores, issues, overall = _score_rubrics("Ask", audit) + # total_score = 0 + 2 + 1 = 3, total_max = 6 + assert overall == pytest.approx(3.0 / 6) + assert any(i["severity"] == "critical" for i in issues) + + +def test_score_rubrics_na_excluded_from_denominator(): + # secondary_dimensions_present=NA → excluded + audit = { + "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, + "rubric_results": { + "core_dimensions_present": "YES", + "secondary_dimensions_present": "NA", + "statement_unambiguous": "YES", + } + } + dim_scores, issues, overall = _score_rubrics("Ask", audit) + # total_score = 3 + 1 = 4, total_max = 3+1 = 4 + assert overall == pytest.approx(1.0) + + +def test_score_rubrics_pass_threshold(): + # Acquire: all YES → overall=1.0 → pass + rubric_results = {k: "YES" for k in RUBRIC_WEIGHTS["Acquire"]} + audit = { + "gate_results": {"search_terms_valid": "YES"}, + "rubric_results": rubric_results, + } + _, _, overall = _score_rubrics("Acquire", audit) + assert overall >= PASS_THRESHOLD + + +# ── _score_ask gate path ────────────────────────────────────────────────────── + +def test_score_ask_gate_fail_returns_zero(): + audit = {"gate_results": {"intent_not_distorted": "NO", "route_correct": "YES"}} + dim_scores, issues, search_exhausted, hint = _score_ask(audit) + assert any(i["severity"] == "critical" for i in issues) + assert "intent_not_distorted" in hint + + +def test_score_ask_direct_answer_correct(): + audit = {"gate_results": { + "intent_not_distorted": "YES", + "route_correct": "NA", + "nonresearch_classification_correct": "YES", + }, "rubric_results": {}} + dim_scores, issues, search_exhausted, hint = _score_ask(audit) + assert 
"terminate" in hint or "direct_answer" in hint + + +# ── _score_acquire search_exhausted ────────────────────────────────────────── + +def test_score_acquire_search_exhausted(): + audit = {"search_exhausted": True, "gate_results": {}, "rubric_results": {}} + dim_scores, issues, search_exhausted, hint = _score_acquire(audit) + assert search_exhausted is True + + +# ── _appraise_layer1_check ──────────────────────────────────────────────────── + +def test_appraise_layer1_pass(): + from dataclasses import dataclass + from typing import Optional + + @dataclass + class FakeEvidence: + pmid: str + study_type: str + risk_of_bias: str + grade_level: Optional[str] = "Moderate" + + @dataclass + class FakeAppraisal: + evidence: list + has_conflict: bool = False + conflict_description: Optional[str] = None + summary: str = "" + + output = {"appraisal_results": FakeAppraisal(evidence=[ + FakeEvidence(pmid="123", study_type="RCT", risk_of_bias="NOT_SERIOUS"), + ])} + result = _appraise_layer1_check(output) + assert result["passed"] is True + + +def test_appraise_layer1_missing_study_type(): + from dataclasses import dataclass + from typing import Optional + + @dataclass + class FakeEvidence: + pmid: str + study_type: Optional[str] + risk_of_bias: str + grade_level: Optional[str] = None + + @dataclass + class FakeAppraisal: + evidence: list + has_conflict: bool = False + conflict_description: Optional[str] = None + summary: str = "" + + output = {"appraisal_results": FakeAppraisal(evidence=[ + FakeEvidence(pmid="456", study_type=None, risk_of_bias="NOT_SERIOUS"), + ])} + result = _appraise_layer1_check(output) + assert result["passed"] is False + assert any("study_type" in f for f in result["failures"]) + + +def test_appraise_layer1_illegal_grade_raises(): + from dataclasses import dataclass + from typing import Optional + + @dataclass + class FakeEvidence: + pmid: str + study_type: str + risk_of_bias: str + grade_level: Optional[str] + + @dataclass + class FakeAppraisal: + 
evidence: list + has_conflict: bool = False + conflict_description: Optional[str] = None + summary: str = "" + + output = {"appraisal_results": FakeAppraisal(evidence=[ + FakeEvidence(pmid="789", study_type="RCT", risk_of_bias="NOT_SERIOUS", grade_level="ILLEGAL"), + ])} + with pytest.raises(SystemError, match="grade_output_in_legal_range"): + _appraise_layer1_check(output) +``` + +- [ ] **Step 2: 运行测试** + +```bash +python3 -m pytest tests/test_judge_rubrics.py -v --tb=short +``` + +Expected: all tests pass. + +- [ ] **Step 3: 如有失败,修复后重跑** + +```bash +python3 -m pytest tests/test_judge_rubrics.py -v --tb=short +``` + +--- + +## Task 8: 端到端冒烟验证 + +**Files:** +- No file changes — validation only + +- [ ] **Step 1: 验证所有 prompt 文件格式占位符** + +```bash +python3 -c " +from pathlib import Path +stages = { + 'ask': ['{original_question}', '{route_type}', '{stage_output}'], + 'acquire': ['{route_type}', '{ebm_query}', '{stage_output}'], + 'appraise': ['{evidence_list}', '{stage_output}'], + 'apply': ['{route_type}', '{query_description}', '{appraisal_results}', '{stage_output}'], +} +for stage, placeholders in stages.items(): + txt = Path(f'src/config/prompts/judge/{stage}_judge.txt').read_text() + for p in placeholders: + assert p in txt, f'Missing {p} in {stage}_judge.txt' + assert 'gate_results' in txt + assert 'rubric_results' in txt + print(f'{stage}_judge.txt: OK') +print('All prompt files validated.') +" +``` + +- [ ] **Step 2: 验证 judge_llm.py 可导入** + +```bash +python3 -c " +from src.judge.judge_llm import ( + _check_gates, _score_rubrics, _score_ask, _score_acquire, + _score_appraise, _score_apply, _appraise_layer1_check, + RUBRIC_WEIGHTS, PASS_THRESHOLD +) +print('judge_llm.py imports OK') +print('Stages with rubrics:', list(RUBRIC_WEIGHTS.keys())) +" +``` + +- [ ] **Step 3: 运行完整测试套件** + +```bash +python3 -m pytest tests/ --tb=short -q || [ $? -eq 5 ] +``` + +Expected: all tests pass (exit 0 or 5 if no other tests collected). 
+ +- [ ] **Step 4: 运行 lint** + +```bash +python3 -m ruff check src/judge/judge_llm.py src/config/prompts/ +``` + +Expected: no new errors. + +--- + +## 补充说明:`STAGE_SCORERS` 更新 + +Task 5 Step 7 完成后,需确认 `judge_llm.py` 中的 `STAGE_SCORERS` dispatch table 与新的 `_score_*` 函数签名仍然对齐(结论:无需改动,理由见下)。 + +现有 `STAGE_SCORERS`(`judge_llm.py:826`): + +```python +STAGE_SCORERS = { + "Ask": _score_ask, + "Acquire": _score_acquire, + "Appraise": _score_appraise, + "Apply": _score_apply, + "Assess": _score_assess, +} +``` + +新的 `_score_ask/_score_acquire/_score_appraise/_score_apply` 签名与原来相同(均接受 `audit: dict`,返回 `(dim_scores, issues, search_exhausted, reasoning_hint)`),因此 `STAGE_SCORERS` 本身**无需修改**,dispatch 逻辑不变。 + +唯一需要注意的是 Task 5 Step 8 中 `evaluate_stage` 里 overall_score 的计算方式:对 Ask/Acquire/Appraise/Apply 阶段,在调用 `scorer(audit)` 之后,额外调用 `_score_rubrics(stage, audit)` 取得 overall_score,而不再走 `_calculate_overall_score(stage, dimension_scores)`。 + +--- diff --git a/docs/superpowers/specs/2026-04-20-acquire-agent-redesign.md b/docs/superpowers/specs/2026-04-20-acquire-agent-redesign.md new file mode 100644 index 0000000..780115e --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-acquire-agent-redesign.md @@ -0,0 +1,249 @@ +# Acquire Agent 重设计规范 + +**日期**: 2026-04-20 +**范围**: Acquire 阶段(`acquire_agent.py` + `acquire_agent.txt` + `acquire_ranking.txt` + `pubmed_api.py` + `schema.py` 小改) +**不在本次范围内**: Acquire Judge 的格式适配;diagnostic_reasoning 子问题的多路检索 + +--- + +## 背景与目标 + +当前 Acquire 阶段存在以下问题: + +1. **硬编码 PICO 格式**:从 `pico_query` 读取字段,无法处理 Ask 新架构输出的 `EBMQuery`(PIRD/PEO/Prognosis 格式) +2. **仅使用摘要**:PubMed 摘要信息有限,无法支撑 Appraise 阶段的完整 GRADE 评级 +3. **无全文检索**:缺乏从 PMC 获取全文的能力,证据本体无法被利用 +4. 
**过滤器映射依赖旧 `question_type`**:需适配新的 `route_type` 字段 + +目标:引入两段式检索(PubMed 发现 + PMC 全文获取)和混合 RAG(BM25 + Embedding),在 Listwise 排序前为每篇文章提取最相关段落,提升后续 Appraise 的证据质量。 + +--- + +## 新流程 + +``` +EBMQuery(来自 Ask 阶段) + ↓ +[LLM 构建 Boolean 查询] ← acquire_agent.txt(按 query_type 注入对应字段) + ↓ +PubMed 检索(max 20 篇,按 route_type 选过滤器) + ↓ +并行拉取 PMC 全文(有 pmcid 的文章,as_completed + timeout=10s/篇) + 无全文 → has_full_text=False,使用摘要作为 RAG 源 + 有全文 → has_full_text=True,使用全文作为 RAG 源 + ↓ +[混合 RAG 预处理](所有 20 篇,每篇独立执行) + query_string = " ".join(keywords)(拼接为单一查询串) + BM25 初筛:Top-min(8, len(chunks)) 段落 + Embedding 精排:Top-min(3, len(bm25_top)) 段落 → 写入 key_sentences + ↓ +[候选集缩减]:按 RAG 相关性分数保留 Top-10 + ↓ +[后处理分层]:has_full_text=True 的文章整体排在 has_full_text=False 之前 + ↓ +[Listwise 排序](≤10 篇,使用 key_sentences) + ↓ +Top-K 输出(key_sentences 随 Evidence 传给 Appraise) +``` + +--- + +## 一、EBMQuery 适配 + +### 过滤器映射更新 + +旧 `_FILTER_BY_QUESTION_TYPE` 替换为 `_FILTER_BY_ROUTE_TYPE`,同时保留旧映射作为兼容回退: + +| route_type | 过滤器 | 说明 | +|---|---|---| +| `ebm_pico` | `_HSSS_FILTER` | RCT + SR,治疗/干预 | +| `ebm_pird` | `_DTA_FILTER` | 诊断准确性 | +| `ebm_peo` | `_OBSERVATIONAL_FILTER` | 观察性研究,病因/危险因素 | +| `ebm_prognosis` | `_OBSERVATIONAL_FILTER` | 观察性研究,预后 | +| 旧 `question_type` 字符串 | 原有映射 | 过渡期兼容 | + +### 查询构建 prompt 字段注入(acquire_agent.txt) + +按 `query_type` 注入不同字段标签: + +| query_type | patient | primary_focus | comparator | outcome | 额外字段 | +|---|---|---|---|---|---| +| `pico` | Patient | Intervention | Comparison | Outcome | — | +| `pird` | Patient | Index Test | Reference Standard | Diagnostic Accuracy | — | +| `peo` | Patient | Exposure | —(不注入) | Outcome | — | +| `prognosis` | Patient | Prognostic Factor | —(不注入) | Outcome | time_horizon | + +### Listwise ranking prompt(acquire_ranking.txt) + +字段标签按 `query_type` 动态替换,不再硬编码"Intervention/Comparison"字样。 + +--- + +## 二、两段式检索 + +### Stage 1:PubMed 检索 + +现有逻辑保持不变。读取来源优先 `ebm_query`,回退 `pico_query`(兼容过渡期)。 + +### Stage 2:PMC 全文并行拉取 + +使用 `as_completed` 模式,每篇设置 10 秒超时,单篇失败不影响其余文章: + +```python 
+from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed + +def _fetch_full_texts(self, candidates: List[Evidence]) -> None: + executor = ThreadPoolExecutor(max_workers=8) # 不使用 with:退出时会等待未完成任务,抵消整批30秒超时 + futures = { + executor.submit(fetch_pmc_full_text, e.pmcid): e for e in candidates if e.pmcid + } + try: + for future in as_completed(futures, timeout=30): + evidence = futures[future] + try: + text = future.result(timeout=10) # future 已完成,此处不会阻塞;单篇10秒超时须由 fetch_pmc_full_text 的请求超时保证 + if text: + evidence.full_text = text + evidence.has_full_text = True + except Exception: + pass # 单篇失败:保持 has_full_text=False,继续用摘要 + except FuturesTimeoutError: + pass # 整批30秒超时:已完成的文章保留结果,未完成的保持 has_full_text=False +``` + +`fetch_pmc_full_text(pmcid)` 新增于 `pubmed_api.py`,通过 PMC OA API 获取全文 XML 并解析为纯文本。 + +--- + +## 三、混合 RAG(BM25 Top-8 → Embedding Top-3) + +### BM25-first 缺陷与缓解 + +BM25-first pipeline 在医学领域存在已知缺陷:同义词和缩写丰富(如 "myocardial infarction" vs "acute coronary syndrome"),词汇不匹配会导致语义相关段落被 BM25 过滤。 + +缓解措施(双管齐下): +1. **BM25 阈值放宽**:初筛取 Top-8,给 embedding 更大候选池 +2. **依赖 Ask 阶段 keywords 质量**:`EBMQuery.keywords` 要求包含 MeSH 词 + 同义词(Ask Judge 的 `has_synonyms_or_mesh` 已覆盖),BM25 查询串展开全部 keywords + +### `_rag_extract` 实现(含完整边界处理) + +```python +def _rag_extract(self, evidence: Evidence, query_terms: List[str]) -> Tuple[str, float]: + """返回 (key_sentences, relevance_score)。 + relevance_score = 最高 embedding cosine similarity 分数(0.0 表示降级路径)。 + """ + source = evidence.full_text if evidence.has_full_text else (evidence.abstract or "") + + # 防御性检查 + if not source or not source.strip(): + return "", 0.0 + if not query_terms: + return source[:1000], 0.0 # 降级:直接返回前段内容,分数为0 + + chunks = self._chunk_text(source, chunk_size=512) + + # chunks 数量可能少于 top_n(如摘要只产生1个 chunk) + bm25_top_n = min(8, len(chunks)) + bm25_top = bm25_retrieve(chunks, query_terms, top_n=bm25_top_n) + + # embedding 接收单一查询字符串,而非关键词列表 + query_string = " ".join(query_terms) + rerank_top_n = min(3, len(bm25_top)) + reranked, top_score = self._embedding_rerank(bm25_top, query=query_string, top_n=rerank_top_n) + # _embedding_rerank 返回 (List[str], float),top_score 为最高 
cosine similarity + + key_sentences = "\n---\n".join(reranked) if reranked else source[:1000] + score = top_score if reranked else 0.0 + return key_sentences, score +``` + +**注意**:`_embedding_rerank` 需同时返回排序后的段落列表和最高相关性分数,供候选集缩减使用。 + +### Embedding 模型单例(线程安全) + +```python +import threading +_model_lock = threading.Lock() +_embedding_model = None + +def _get_embedding_model(): + global _embedding_model + with _model_lock: + if _embedding_model is None: + from sentence_transformers import SentenceTransformer + _embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + return _embedding_model +``` + +模块级单例 + 锁保护,多线程场景下安全。首次加载约 5-10 秒,模型文件约 80MB,从 HuggingFace Hub 下载(首次运行需网络)。离线部署时需提前下载并通过 `SENTENCE_TRANSFORMERS_HOME` 环境变量指定本地路径。 + +--- + +## 四、候选集缩减与后处理分层 + +### 候选集缩减(RAG 后,Listwise 前) + +RAG 预处理完成后,20 篇候选按 `_rag_extract` 返回的 `relevance_score`(最高 embedding cosine similarity)降序保留 Top-10,避免 Listwise prompt 超出 context window: + +``` +20篇 × 3段 × ~512 tokens ≈ 30,000 tokens(超出大多数模型上限) +→ 缩减到 10篇 × 3段 × ~512 tokens ≈ 15,000 tokens(可控) +``` + +降级路径(`relevance_score=0.0`)的文章排在有分数的文章之后,保证有实际相关内容的文章优先进入 Listwise。 + +### 后处理分层(Listwise 后) + +Listwise 排序完成后,强制将 `has_full_text=True` 的文章整体排在 `has_full_text=False` 之前,不依赖 prompt 指令: + +```python +def _post_sort_by_full_text(self, ranked: List[Evidence]) -> List[Evidence]: + full_text = [e for e in ranked if e.has_full_text] + abstract_only = [e for e in ranked if not e.has_full_text] + return full_text + abstract_only +``` + +Listwise 排序只负责各组内部的相关性排序,后处理保证全文组整体优先。 + +--- + +## 五、数据类变更 + +`Evidence`(`schema.py`)新增字段: + +```python +has_full_text: bool = False # 是否成功获取 PMC 全文 +``` + +(`full_text` 和 `key_sentences` 字段已存在,无需新增) + +--- + +## 六、新增依赖 + +| 库 | 用途 | 安装 | +|---|---|---| +| `rank-bm25` | BM25 检索 | `pip install rank-bm25` | +| `sentence-transformers` | Embedding 精排 | `pip install sentence-transformers` | + +--- + +## 七、文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/agents/acquire_agent.py` | 修改 | EBMQuery 适配;PMC 拉取 + RAG 
流程;过滤器映射更新;embedding 线程安全单例 | +| `src/config/prompts/acquire_agent.txt` | 修改 | 支持多格式字段注入(PICO/PIRD/PEO/Prognosis) | +| `src/config/prompts/acquire_ranking.txt` | 修改 | 字段标签按 `query_type` 动态适配 | +| `src/tools/pubmed_api.py` | 修改 | 新增 `fetch_pmc_full_text(pmcid)` 函数 | +| `src/state/schema.py` | 小改 | `Evidence` 新增 `has_full_text: bool = False` | +| `requirements.txt` | 小改 | 新增 `rank-bm25`、`sentence-transformers` | + +--- + +## 明确不在本次范围内 + +- Acquire Judge 对 PIRD/PEO/Prognosis 格式的专属评分维度 +- `diagnostic_reasoning` 子问题的多路并行检索 +- Embedding 模型的替换或微调(使用默认 `all-MiniLM-L6-v2`) +- PMC 全文解析的边缘情况处理(付费文章、格式异常等) diff --git a/docs/superpowers/specs/2026-04-20-acquire-judge-redesign.md b/docs/superpowers/specs/2026-04-20-acquire-judge-redesign.md new file mode 100644 index 0000000..3209f47 --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-acquire-judge-redesign.md @@ -0,0 +1,183 @@ +# Acquire Judge 改动规范 + +**日期**: 2026-04-20 +**范围**: `acquire_judge.txt`(修改)+ `judge_llm.py` `_score_acquire` 遗留问题记录 +**不在本次范围内**: `_score_acquire` Python 侧的路由分支权重适配 + +--- + +## 背景与问题 + +现有 Acquire Judge 存在以下问题: + +1. **输入仍用 `{pico_query}`**:Acquire 新架构改为 `EBMQuery`,Judge 对 `route_type` 无感知,导致 PIRD/PEO/Prognosis 场景下"干预维度"的概念错配 +2. **`pico_p_match` / `pico_i_match` / `pico_o_match` 字段名硬编码 PICO**:在 PIRD 场景下审计的是 "Intervention" 而非 "Index Test",语义错误 +3. **`has_full_text` 未纳入审计**:新 Acquire 流程引入 PMC 全文拉取,Judge 应审计全文覆盖率 +4. 
**`key_sentences` 质量未审计**:RAG 提取的 key_sentences 是 Apply 阶段的核心输入,全为空说明 RAG 流程失败 + +--- + +## 改动一:输入字段更新 + +### `acquire_judge.txt` 输入段替换 + +**原:** +``` +## PICO查询 +{pico_query} +``` + +**替换为:** +``` +## 查询信息 +路由类型:{route_type} +结构化查询:{ebm_query} +``` + +Python 侧(`judge_llm.py` `evaluate_stage` Ask 阶段)已将 `route_type` 和 `ebm_query` 写入 state,此处直接读取注入。 + +--- + +## 改动二:维度匹配审计字段通用化 + +### 字段名映射 + +| route_type | 原字段名(硬编码) | 新字段名(通用) | 审计对象 | +|---|---|---|---| +| `ebm_pico` | `pico_i_match` | `primary_focus_match` | Intervention | +| `ebm_pird` | `pico_i_match` | `primary_focus_match` | Index Test | +| `ebm_peo` | `pico_i_match` | `primary_focus_match` | Exposure | +| `ebm_prognosis` | `pico_i_match` | `primary_focus_match` | Prognostic Factor | + +### 更新后的 PICO 匹配度审计段 + +``` +## 3. 查询维度匹配度审计 +**基于证据列表中查询维度匹配度最好的那篇证据进行判断。** + +各 route_type 对应的审计维度: +- ebm_pico: Patient / Intervention / Outcome +- ebm_pird: Patient / Index Test / Target Condition +- ebm_peo: Patient / Exposure / Outcome +- ebm_prognosis: Patient / Prognostic Factor / Outcome + +**p_match**:证据中的研究人群是否与查询的 Patient 匹配? +- `YES`:精准匹配(相同年龄段、相同疾病状态) +- `PARTIAL`:有轻微差异(如年龄范围略不同),结论可审慎外推 +- `NO`:严重不匹配(如成人证据用于儿科问题,或完全不同的疾病) + +**primary_focus_match**:证据中的核心干预/暴露/测试是否与查询的主焦点维度匹配? +(ebm_pico → Intervention;ebm_pird → Index Test;ebm_peo → Exposure;ebm_prognosis → Prognostic Factor) +- `YES`:精准匹配 +- `PARTIAL`:有轻微差异(同类方法,不同剂量/版本),相关性高 +- `NO`:严重不匹配(完全不同的测试/干预/暴露) + +**o_match**:证据中报告的结局是否与查询的 Outcome / Target Condition 匹配? +- `YES`:报告了临床关心的直接结局指标 +- `PARTIAL`:报告了代理指标或部分相关结局 +- `NO`:未报告任何相关结局 +``` + +同时,JSON 输出中原 `pico_p_match` / `pico_i_match` / `pico_o_match` 对应替换为 `p_match` / `primary_focus_match` / `o_match`。 + +--- + +## 改动三:新增 `full_text_audit` + +### 审计段新增 + +``` +## 5. 全文与 RAG 质量审计 + +**full_text_coverage**:Top 文章(排名前3)中,has_full_text=True 的比例是否合理? 
+- `GOOD`:≥2/3 篇有全文,RAG 质量有保障 +- `PARTIAL`:1/3 篇有全文,或全文获取部分失败,仍有可用摘要 +- `NONE`:Top 3 篇均无全文(has_full_text 全为 False),仅凭摘要进行 RAG + +**key_sentences_present**:key_sentences 字段是否有实质内容? +- `YES`:Top 文章的 key_sentences 非空,说明 RAG 流程正常执行 +- `PARTIAL`:部分文章的 key_sentences 为空(可能是摘要极短导致 chunk 失败) +- `NO`:所有文章的 key_sentences 均为空,RAG 流程可能失败 + +注意:key_sentences 为空时 Apply 阶段会回退到 abstract,不构成一票否决,但影响 evidence_quality 维度得分。 +``` + +--- + +## 改动四:更新系统错误检测段 + +原有"首先检查 `error` 字段"逻辑保留,固定输出中 `pico_p_match` / `pico_i_match` / `pico_o_match` 替换为新字段名: + +```python +# 错误时固定输出 +"query_match": { + "p_match": "NO", + "primary_focus_match": "NO", + "o_match": "NO" +}, +"full_text_audit": { + "full_text_coverage": "NONE", + "key_sentences_present": "NO" +} +``` + +--- + +## 完整更新后的 JSON 输出格式 + +```json +{ + "search_audit": { + "search_terms_valid": "YES | NO" + }, + "evidence_audit": { + "best_study_type": "SR_META | RCT | COHORT | CASE_CONTROL | CASE_REPORT | NONE", + "best_evidence_answers_query": "YES | PARTIAL | NO" + }, + "query_match": { + "p_match": "YES | PARTIAL | NO", + "primary_focus_match": "YES | PARTIAL | NO", + "o_match": "YES | PARTIAL | NO" + }, + "listwise_audit": { + "top_selection_appropriate": "YES | PARTIAL | NO", + "selection_count_appropriate": "YES | PARTIAL | NO" + }, + "full_text_audit": { + "full_text_coverage": "GOOD | PARTIAL | NONE", + "key_sentences_present": "YES | PARTIAL | NO" + }, + "search_exhausted": false, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | degraded" +} +``` + +`reasoning` 字段删除,替换为结构化的 `failures` + `overall_quality`,与 Ask Judge 统一输出框架。 + +--- + +## `_score_acquire` 遗留问题记录(不在本次范围) + +当前 `_score_acquire` Python 侧对所有路由使用相同的 `pico_i_match` 权重。正确做法应当: +- `ebm_pico`:`primary_focus_match`(Intervention)权重维持现有 +- `ebm_pird`:`primary_focus_match`(Index Test)权重应等同于 `p_match`(诊断研究核心) +- `ebm_peo`:`primary_focus_match`(Exposure)权重应与 `o_match` 相当(病因研究两者并重) +- `ebm_prognosis`:`primary_focus_match`(Prognostic Factor)权重较低,`p_match` 和 
`o_match` 更重要 + +**后续迭代处理**,本次仅将字段名统一化,不改变权重逻辑。 + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/judge/acquire_judge.txt` | 修改 | 输入换为 `{route_type}` + `{ebm_query}`;维度字段通用化(`p_match` / `primary_focus_match` / `o_match`);新增 `full_text_audit`;输出改为 `failures` + `overall_quality` 统一框架;删除 `reasoning` | + +--- + +## 明确不在本次范围内 + +- `_score_acquire` Python 侧各路由的维度权重分支 +- Acquire Judge 对 diagnostic_reasoning 子问题多路检索的专项审计 diff --git a/docs/superpowers/specs/2026-04-20-apply-agent-alignment.md b/docs/superpowers/specs/2026-04-20-apply-agent-alignment.md new file mode 100644 index 0000000..4a21cb7 --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-apply-agent-alignment.md @@ -0,0 +1,235 @@ +# Apply Agent 对齐设计规范 + +**日期**: 2026-04-20 +**范围**: `apply_agent.py`(执行逻辑修改)+ `apply_agent.txt`(prompt 修改) +**不在本次范围内**: Apply Judge 的 route_type 适配;Consensus-based 推荐逻辑变更 + +--- + +## 背景与问题 + +Apply 阶段存在四处与前序改动脱节的问题: + +1. **Step 1 硬编码 PICO 维度**:Ask 新架构定义了 PICO/PIRD/PEO/Prognosis 四种格式,Apply 仍用"Population/Intervention/Outcome"框架检查所有问题,PIRD 的"Index Test"被错误映射为"Intervention",Prognosis 缺少"Time Horizon"检查 +2. **`appraisal_summary` 仅注入自由文本**:LLM 无法区分"整体 GRADE=Moderate 但 inconsistency=SERIOUS"与"整体 GRADE=Moderate 且各因素均 NOT_SERIOUS",可能产生错误的 Strong 推荐 +3. **`evidence_summary` 不含证据内容**:只传 title/quality/source,LLM 看不到 Acquire 阶段 RAG 提取的 key_sentences,无法基于实际内容生成推荐 +4. 
**Python 侧 GRADE enforcement 不完整**:只处理"Low/Very Low → 阻止 Strong",未处理"inconsistency=SERIOUS → 阻止 Strong" + +--- + +## 改动一:prompt 注入 `route_type` + `ebm_query`(解决问题1) + +### `apply_agent.py` execute() 新增 + +```python +# 读取路由信息(兼容过渡期:优先 ebm_query,回退 pico_query) +route_type = state.get("route_type") or "ebm_pico" +ebm_query = state.get("ebm_query") +pico_query = state.get("pico_query") + +if ebm_query: + query_description = _format_ebm_query(ebm_query) +elif pico_query: + query_description = _format_pico_query(pico_query) +else: + query_description = "N/A" +``` + +`_format_ebm_query` 和 `_format_pico_query` 输出纯文本,格式模板如下: + +``` +# _format_ebm_query(按 query_type 选择标签) +PICO: Patient: {patient} | Intervention: {primary_focus} | Comparator: {comparator} | Outcome: {outcome} +PIRD: Patient: {patient} | Index Test: {primary_focus} | Reference Standard: {reference_standard} | Target Condition: {outcome} +PEO: Patient: {patient} | Exposure: {primary_focus} | Outcome: {outcome} +Prognosis: Patient: {patient} | Prognostic Factor: {primary_focus} | Outcome: {outcome} | Time Horizon: {time_horizon} + +# _format_pico_query(旧格式兼容) +Patient: {population} | Intervention: {intervention} | Comparator: {comparison} | Outcome: {outcome} +``` + +所有 `None` 值字段输出为 `"N/A"` 而非 Python `None`,避免 prompt 中出现 `None` 字面量。 + +### `apply_agent.txt` Step 1 替换 + +**原文:** +``` +**Step 1 - PICO Consistency Check:** +- Population match +- Intervention match +- Outcome match +``` + +**替换为:** +``` +**Step 1 - Query Consistency Check:** +Route Type: {route_type} +Structured Query: {query_description} + +Check evidence applicability based on the route_type dimensions: + +- PICO: Population / Intervention / Comparator / Outcome +- PIRD: Population / Index Test / Reference Standard / Target Condition +- PEO: Population / Exposure / Outcome(no Comparator) +- Prognosis: Population / Prognostic Factor / Outcome / Time Horizon + +For each dimension of the current route_type, assess: + - Match: evidence directly 
matches the query dimension + - Partial: approximate match (similar but not identical population, surrogate endpoint, analogous intervention) + - Mismatch: fundamental mismatch — must flag explicitly in caveats +``` + +--- + +## 改动二:注入结构化 GRADE 字段(解决问题2) + +### `apply_agent.py` 新增 appraisal_summary 构建 + +```python +# 从 grade_rationales 提取关键降级因素摘要 +grade_rationales = state.get("grade_rationales", []) + +def _summarize_downgrade_factors(rationales: list) -> str: + """统计各降级因素被评为 SERIOUS/VERY_SERIOUS 的研究数量,并拼接为一行摘要。""" + factor_counts = {} + for r in rationales: + for factor in ("risk_of_bias", "inconsistency", "indirectness", "imprecision"): + val = r.get(factor, "NOT_SERIOUS") + if val in ("SERIOUS", "VERY_SERIOUS"): + factor_counts[factor] = factor_counts.get(factor, 0) + 1 + if not factor_counts: + return "All downgrade factors: NOT_SERIOUS" + return "; ".join( + f"{k}: SERIOUS/VERY_SERIOUS ({v}/{len(rationales)} studies)" + for k, v in factor_counts.items() + ) + +key_downgrade_factors = _summarize_downgrade_factors(grade_rationales) + +# inconsistency 专项标记:任一文章 inconsistency=SERIOUS/VERY_SERIOUS 则触发 +has_serious_inconsistency = any( + r.get("inconsistency") in ("SERIOUS", "VERY_SERIOUS") + for r in grade_rationales +) +consistency_flag = "SERIOUS inconsistency detected" if has_serious_inconsistency else "Consistent" +``` + +### `apply_agent.txt` 替换 `{appraisal_summary}` 注入格式 + +**原注入:** +``` +Overall Appraisal: {appraisal_summary} +``` + +**替换为:** +``` +Overall GRADE: {overall_grade} +Key downgrade factors: {downgrade_factors} +Evidence consistency: {consistency_flag} +Appraisal narrative: {appraisal_narrative} +``` + +### `apply_agent.txt` Step 3 新增 inconsistency 规则 + +在现有 Strength 规则后追加: + +``` +- If inconsistency was rated SERIOUS in appraisal (consistency_flag = "SERIOUS inconsistency + detected") → treat results as "inconsistent" → cap strength at Weak, + regardless of overall GRADE level +``` + +--- + +## 改动三:evidence_summary 注入 key_sentences(解决问题3) + +### `apply_agent.py` 
evidence_summary 构建修改 + +**原:** +```python +evidence_summary = "\n\n".join([ + f"Evidence {i+1}:\nTitle: {e.title}\nQuality: {e.grade_level}\nSource: {e.source}" + for i, e in enumerate(appraisal.evidence) +]) +``` + +**替换为:** +```python +evidence_summary = "\n\n".join([ + f"Evidence {i+1}:\n" + f"Title: {e.title}\n" + f"GRADE: {e.grade_level}\n" + f"Study Type: {e.study_type}\n" + f"Key Findings:\n{e.key_sentences or e.abstract or '(无摘要)'}" + for i, e in enumerate(appraisal.evidence) +]) +``` + +`key_sentences` 由 Acquire 阶段 RAG 提取写入,此处首次被 LLM 实际消费用于生成推荐。若 key_sentences 为空(过渡期未完成 RAG 改造时),回退到 abstract。 + +--- + +## 改动四:Python 侧 GRADE enforcement 补全(解决问题4) + +### `apply_agent.py` strength enforcement 修改 + +**原:** +```python +if evidence_quality in ("Very Low", "Low") and llm_strength == "Strong": + strength = "Weak" +else: + strength = llm_strength +``` + +**替换为:** +```python +llm_strength = rec_dict.get("strength", "Weak") + +# evidence_quality:从 state["appraisal_result"].overall_grade 读取(Appraise 阶段写入) +# 取值范围:"High" | "Moderate" | "Low" | "Very Low" +evidence_quality = state.get("appraisal_result", {}).get("overall_grade", "Very Low") + +# Rule 1: Low/Very Low 证据不可为 Strong +if evidence_quality in ("Very Low", "Low") and llm_strength == "Strong": + strength = "Weak" +# Rule 2: inconsistency=SERIOUS 时强制 Weak(无论 GRADE 等级) +# 触发策略:任一文章 inconsistency=SERIOUS/VERY_SERIOUS 即触发(保守策略) +# 设计意图:单篇严重不一致足以使整体证据体的方向性结论不可靠, +# 不设比例阈值,避免"多数通过"掩盖关键异质性 +elif has_serious_inconsistency and llm_strength == "Strong": + strength = "Weak" +else: + strength = llm_strength +``` + +两条规则均在 Python 侧强制执行,防止 LLM 违反 GRADE 原则。 + +--- + +## prompt 模板变量更新汇总 + +| 变量 | 原来 | 现在 | +|---|---|---| +| `{route_type}` | 无 | 新增,来自 `state["route_type"]` | +| `{query_description}` | 无 | 新增,由 `ebm_query` 或 `pico_query` 格式化 | +| `{appraisal_summary}` | 自由文本 | 拆分为4个结构化字段 | +| `{overall_grade}` | 无 | 新增,来自 Python 计算的 evidence_quality | +| `{downgrade_factors}` | 无 | 新增,来自 grade_rationales 摘要 | +| 
`{consistency_flag}` | 无 | 新增,"SERIOUS inconsistency detected" or "Consistent" | +| `{appraisal_narrative}` | `{appraisal_summary}` | 重命名,保留原自由文本叙述 | + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/apply_agent.txt` | 修改 | Step 1 按 route_type 动态一致性检查维度;新增结构化 GRADE 输入字段;Step 3 新增 inconsistency 触发 Weak 规则 | +| `src/agents/apply_agent.py` | 修改 | 注入 route_type + query_description;构建结构化 appraisal_summary(downgrade_factors、consistency_flag、appraisal_narrative);evidence_summary 加入 key_sentences;Python enforcement 补全 inconsistency Rule 2 | + +--- + +## 明确不在本次范围内 + +- Apply Judge(`judge_llm.py` `_score_apply`)的 route_type 适配 +- Consensus-based 推荐的引用格式变更 diff --git a/docs/superpowers/specs/2026-04-20-apply-judge-redesign.md b/docs/superpowers/specs/2026-04-20-apply-judge-redesign.md new file mode 100644 index 0000000..6c6efa1 --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-apply-judge-redesign.md @@ -0,0 +1,141 @@ +# Apply Judge 改动规范 + +**日期**: 2026-04-20 +**范围**: `apply_judge.txt`(修改) +**不在本次范围内**: `_score_apply` Python 侧权重调整;Consensus-based 推荐引用格式变更 + +--- + +## 背景与问题 + +现有 Apply Judge 存在以下问题,均源于与 Apply Agent 对齐改动(`2026-04-20-apply-agent-alignment.md`)脱节: + +1. **输入仍用 `{pico_query}`**:Apply 新架构注入了 `route_type + query_description`,Judge 对路由框架无感知,无法审计"维度一致性检查是否按正确路由框架执行" +2. **`strength_matches_evidence_quality` 规则缺少 inconsistency 条款**:Apply enforcement Rule 2 规定 `inconsistency=SERIOUS → 强制 Weak`,但 Judge 的评判规则没有这条,会将正确的"Moderate 证据给 Weak"误标为 MINOR_MISMATCH +3. **无路由维度一致性审计**:Apply Step 1 现按路由框架做维度一致性检查,但 Judge 没有审计"Apply 是否选用了正确的维度框架(PICO/PIRD/PEO/Prognosis)" + +--- + +## 改动一:输入字段更新 + +### `apply_judge.txt` 输入段替换 + +**原:** +``` +## PICO查询 +{pico_query} +``` + +**替换为:** +``` +## 查询信息 +路由类型:{route_type} +结构化查询:{query_description} +``` + +--- + +## 改动二:新增路由维度一致性审计 + +### 在 `## 1. 推荐-证据匹配审计` 前新增 + +``` +## 0. 路由维度一致性审计 + +**route_dimension_consistent**:Apply 的维度一致性检查(Step 1)是否使用了与 route_type 匹配的维度框架? 
+各 route_type 对应的正确框架: +- ebm_pico: Population / Intervention / Comparator / Outcome +- ebm_pird: Population / Index Test / Reference Standard / Target Condition +- ebm_peo: Population / Exposure / Outcome(无 Comparator) +- ebm_prognosis: Population / Prognostic Factor / Outcome / Time Horizon +- direct_answer: 不做维度一致性检查(直接操作性指导,无需 PICO 框架) + +- `YES`:维度框架与 route_type 匹配,评估覆盖了该框架的全部维度 +- `PARTIAL`:框架大致正确,但遗漏了个别维度(如 Prognosis 遗漏了 Time Horizon 检查) +- `NO`:使用了错误框架(如 PIRD 问题用 PICO 框架,Index Test 被错误映射为 Intervention) +- `NA`:route_type 为 direct_answer,不适用 +``` + +--- + +## 改动三:`strength_matches_evidence_quality` 规则补全 + +### 原规则说明(节选) + +``` +EBM原则: +- Strong推荐需要High/Moderate直接证据; +- Weak推荐适用于Low质量证据或结果不一致; +- ... +- Very Low证据且不一致 → 只能支持Weak/Conditional/Consensus-based或证据不足声明。 +``` + +### 在规则列表末尾追加 + +``` +- 若 Appraise 阶段任一研究的 inconsistency 被评为 SERIOUS/VERY_SERIOUS(即 + consistency_flag = "SERIOUS inconsistency detected"),则无论整体 GRADE 等级如何, + 推荐强度上限为 Weak——此时即使 GRADE=Moderate/High,给出 Weak 也是**正确行为**, + 不应标注为 MISMATCH。 +``` + +### 同时更新 `MINOR_MISMATCH` 和 `MAJOR_MISMATCH` 描述 + +``` +- `YES`:推荐强度与证据质量严格匹配(含 Conditional/Consensus-based 使用正确; + 含 inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐) +- `MINOR_MISMATCH`:有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 +- `MAJOR_MISMATCH`:严重不匹配(如 Very Low/Low 证据给 Strong,或有充分直接高质量证据却输出 No Recommendation) +``` + +--- + +## 改动四:输出格式更新 + +### JSON 输出新增 `route_dimension_consistent` 字段,并统一为 `failures` + `overall_quality` 框架 + +```json +{ + "route_audit": { + "route_dimension_consistent": "YES | PARTIAL | NO | NA" + }, + "grounding_audit": { + "recommendation_based_on_evidence": "YES | PARTIAL | NO", + "uses_external_knowledge": "YES | NO" + }, + "strength_audit": { + "insufficient_evidence_appropriate": "YES | NO | NA", + "strength_matches_evidence_quality": "YES | MINOR_MISMATCH | MAJOR_MISMATCH" + }, + "actionability_audit": { + "recommendation_specific": "YES | PARTIAL | NO", + "caveats_documented": "YES | PARTIAL | NO | NA" + 
}, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | degraded" +} +``` + +--- + +## `_score_apply` 评分说明(无需改动) + +`route_dimension_consistent=NO` 属于 MAJOR 问题,Apply 阶段应触发 retry。现有 `_score_apply` 已有 major issue → 降分逻辑,无需额外适配。 + +Python 侧如需针对 `route_dimension_consistent=NO` 做一票否决,可在后续迭代中与其他 Judge 的 critical 路径对齐处理。 + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/judge/apply_judge.txt` | 修改 | 输入换为 `{route_type}` + `{query_description}`;新增 `route_dimension_consistent` 审计;`strength_matches_evidence_quality` 规则补全 inconsistency=SERIOUS 条款;输出改为 `failures` + `overall_quality` 统一框架 | + +--- + +## 明确不在本次范围内 + +- `_score_apply` Python 侧权重调整 +- Apply Judge 对 `route_dimension_consistent=NO` 的一票否决路径 +- Consensus-based 推荐的引用格式审计 diff --git a/docs/superpowers/specs/2026-04-20-appraise-agent-grade-fix.md b/docs/superpowers/specs/2026-04-20-appraise-agent-grade-fix.md new file mode 100644 index 0000000..2bbae94 --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-appraise-agent-grade-fix.md @@ -0,0 +1,187 @@ +# Appraise Agent GRADE 修正规范 + +**日期**: 2026-04-20 +**范围**: `appraise_agent.py`(`_compute_grade` 重写)+ `appraise_agent.txt`(新增字段说明) +**不在本次范围内**: PIRD 语境下 CROSS_SECTIONAL 的初始分问题;SR 纳入观察性研究时的升级因素;Appraise Judge 的格式适配 + +--- + +## 背景与问题 + +对照《循证医学的核心方法与主要模型》(表4)及 GRADE 原始文献(Guyatt 2011),现有实现存在以下错误: + +1. **升级因素缺失第三条**:GRADE 列出3个升级因素,现实现只有2个,漏掉"负偏倚(confounding_bias_mitigates)" +2. **SR/MA/NMA 初始等级固定为 High**:应取决于纳入研究类型(RCT→High;观察性→Low;混合→Moderate) +3. **CROSS_SECTIONAL 不应适用升级因素**:横断面研究不评价因果效应,升级因素在概念上不适用 +4. **观察性研究升级上限缺失**:观察性研究即使有升级因素,最多升至 Moderate,不应达到 High +5. 
**严重偏倚风险时升级因素不应适用**:`risk_of_bias = VERY_SERIOUS` 时允许升级违背 GRADE 核心原则 + +--- + +## 修正后的完整计算逻辑 + +### 数据表变更 + +```python +# 移除 SYSTEMATIC_REVIEW / META_ANALYSIS / NMA(改为动态计算) +_INITIAL_POINTS: Dict[str, int] = { + "RCT": 4, + "COHORT": 2, + "CASE_CONTROL": 2, + "CROSS_SECTIONAL": 2, + "NARRATIVE_REVIEW": 1, + "CASE_REPORT": 1, + "GUIDELINE": 3, # 务实简化:基于其引用的基础证据质量,保守取 Moderate + "EXPERT_OPINION": 1, +} + +# SR/MA/NMA 初始分取决于纳入研究类型 +_SR_INITIAL_POINTS: Dict[str, int] = { + "RCT": 4, # 纳入研究以 RCT 为主(≥80%)→ High + "OBSERVATIONAL": 2, # 纳入研究以观察性研究为主(≥80%)→ Low + "MIXED": 3, # RCT 占比 20%~79%(含灰区)→ Moderate(保守) + "UNKNOWN": 3, # 无法判断 → 保守取 Moderate +} + +# 仅 COHORT / CASE_CONTROL 适用升级因素 +# CROSS_SECTIONAL 不适用(不评价因果效应) +# SR/MA/NMA 当前迭代不适用升级因素(即使 included_study_type=OBSERVATIONAL) +_UPGRADE_STUDY_TYPES = {"COHORT", "CASE_CONTROL"} +``` + +### 修正后的 `_compute_grade` + +```python +def _compute_grade(appraisal: Dict) -> str: + study_type = appraisal.get("study_type", "CASE_REPORT") + + # 1. 初始分 + if study_type in ("SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA"): + included = appraisal.get("included_study_type", "UNKNOWN") + points = _SR_INITIAL_POINTS.get(included, 3) + else: + points = _INITIAL_POINTS.get(study_type, 1) + + # 2. 降级(5个因素,顺序在升级之前) + for factor in ("risk_of_bias", "inconsistency", "indirectness", "imprecision"): + points -= _DOWNGRADE_PENALTY.get(appraisal.get(factor, "NOT_SERIOUS"), 0) + if appraisal.get("publication_bias") == "SUSPECTED": + points -= 1 + + # 3. 升级(仅 COHORT / CASE_CONTROL) + if study_type in _UPGRADE_STUDY_TYPES: + # 前置条件:存在严重偏倚风险时,升级因素不适用 + # 依据:GRADE(Guyatt 2011)升级因素不能抵消严重方法学缺陷 + has_serious_bias = appraisal.get("risk_of_bias") in ("SERIOUS", "VERY_SERIOUS") + + if not has_serious_bias: + if appraisal.get("large_effect") == "YES": + points += 1 + if appraisal.get("dose_response") == "YES": + points += 1 + if appraisal.get("confounding_bias_mitigates") == "YES": + points += 1 + + # 观察性研究升级上限:Moderate(3分),不可达到 High + points = min(points, 3) + + # 4. 
全局上下限 + points = max(1, min(4, points)) + return _POINTS_TO_GRADE[points] +``` + +### 各 study_type 的行为汇总 + +| study_type | 初始分 | 能否升级 | 实际上限 | +|---|---|---|---| +| RCT | 4 | 否 | High(4) | +| SR/MA/NMA(含RCT) | 4 | 否(当前迭代) | High(4) | +| SR/MA/NMA(混合) | 3 | 否(当前迭代) | Moderate(3) | +| SR/MA/NMA(含观察性) | 2 | 否(当前迭代) | Low(2) | +| GUIDELINE | 3 | 否 | Moderate(3) | +| COHORT / CASE_CONTROL | 2 | 是(无严重偏倚时) | Moderate(3) | +| CROSS_SECTIONAL | 2 | 否 | Low(2) | +| NARRATIVE_REVIEW / CASE_REPORT / EXPERT_OPINION | 1 | 否 | Very Low(1) | + +--- + +## appraise_agent.txt 新增字段说明 + +### 新增:`included_study_type`(仅 SR/MA/NMA 时填写) + +``` +included_study_type(仅当 study_type 为 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时填写): + +- RCT:纳入研究以 RCT 为主(≥80%),适用于治疗性 SR +- OBSERVATIONAL:纳入研究以观察性研究为主(≥80%),如队列研究的 MA +- MIXED:RCT 和观察性研究均占实质性比例(RCT 20%~79% 之间) + 注意:若同时包含 RCT 和病例报告/专家意见,应视实际构成决定, + 不要因少量低质量研究而选 MIXED +- UNKNOWN:文章未报告纳入研究类型,或无法从摘要判断 + +判断规则(优先级从高到低): + 1. RCT ≥80% → RCT + 2. 观察性研究(队列/病例对照/横断面)≥80% → OBSERVATIONAL + 3. 其余(含灰区 RCT 20%~79%)→ MIXED(保守取 Moderate) + 4. 无法判断 → UNKNOWN(同 MIXED,保守取 Moderate) +``` + +### 新增升级因素:`confounding_bias_mitigates`(仅 COHORT/CASE_CONTROL) + +``` +confounding_bias_mitigates(负偏倚,仅适用于 COHORT / CASE_CONTROL): + +- YES:所有合理的残余混杂因素均使观察到的效应偏向无效(低估真实效应), + 即实际效应可能比观测值更大 → +1级 + 例:未校正的混杂因素会降低而非夸大所观察到的关联 +- NO:残余混杂方向不确定,或偏向夸大效应(高估真实效应) +- NA:不适用(非 COHORT/CASE_CONTROL,或无法判断混杂方向) +``` + +### 更新:升级因素适用范围说明 + +``` +### 三、升级因素(仅适用于 COHORT / CASE_CONTROL,且 risk_of_bias 为 NOT_SERIOUS 时) + +注意: +- CROSS_SECTIONAL 研究不适用升级因素(不评价因果效应) +- 存在 SERIOUS 或 VERY_SERIOUS 偏倚风险时,升级因素不适用 +- 观察性研究即使所有升级因素均触发,最终等级上限为 Moderate +``` + +--- + +## `grade_rationales` 新增字段 + +`appraise_agent.py` 的 `grade_rationales` 记录中新增: + +```python +grade_rationales.append({ + ... 
+ "included_study_type": appraisal.get("included_study_type", "NA"), # SR/MA/NMA 专用 + "confounding_bias_mitigates": appraisal.get("confounding_bias_mitigates", "NA"), + "upgrade_blocked_by_bias": ( + study_type in _UPGRADE_STUDY_TYPES + and appraisal.get("risk_of_bias") in ("SERIOUS", "VERY_SERIOUS") + ), + ... +}) +``` + +`upgrade_blocked_by_bias` 字段用于向 Judge 和下游传递"升级因素因偏倚风险被阻断"的信息,供审计使用。 + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/agents/appraise_agent.py` | 修改 | `_compute_grade` 重写;`_INITIAL_POINTS` 移除 SR/MA/NMA;新增 `_SR_INITIAL_POINTS`、`_UPGRADE_STUDY_TYPES`;升级前置条件(偏倚检查);升级上限 `min(points, 3)`;`grade_rationales` 新增3个字段 | +| `src/config/prompts/appraise_agent.txt` | 修改 | 新增 `included_study_type` 字段(SR/MA/NMA 必填,含判断规则);新增 `confounding_bias_mitigates` 升级因素;更新升级因素适用范围说明 | + +--- + +## 已知遗留问题(后续迭代) + +- **CROSS_SECTIONAL 在 PIRD(诊断准确性)语境下**:DTA 研究的标准设计是横断面研究,应使用 QUADAS-2 而非 RoB 2 评价偏倚,初始分逻辑可能需按 `route_type` 分支处理 +- **SR 纳入观察性研究时的升级因素**:理论上如果纳入的队列研究有 large_effect,该 SR 也可升级,当前迭代保守不处理 diff --git a/docs/superpowers/specs/2026-04-20-appraise-judge-redesign.md b/docs/superpowers/specs/2026-04-20-appraise-judge-redesign.md new file mode 100644 index 0000000..cde4628 --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-appraise-judge-redesign.md @@ -0,0 +1,130 @@ +# Appraise Judge 改动规范 + +**日期**: 2026-04-20 +**范围**: `appraise_judge.txt`(修改) +**不在本次范围内**: Appraise Judge 的 `_score_appraise` Python 侧权重调整;PIRD 场景下 CROSS_SECTIONAL 初始分逻辑 + +--- + +## 背景与问题 + +现有 Appraise Judge 存在以下问题,均源于与 Appraise Agent GRADE 修正(`2026-04-20-appraise-agent-grade-fix.md`)脱节: + +1. **`study_type_correct` 未涵盖新 study_type**:SR/MA/NMA 现在需要 `included_study_type` 才能确定初始等级,Judge 没有审计该字段 +2. **`downgrade_factors_appropriate` 未审计升级因素**:新增第三个升级因素 `confounding_bias_mitigates`,Judge 完全未覆盖升级因素合理性 +3. **`upgrade_blocked_by_bias` 未审计**:Appraise 新增该字段,Judge 应验证:"存在 SERIOUS 偏倚时,升级因素是否被正确阻断" +4. 
**`computed_grade_reasonable` 判断标准基于旧逻辑**:SR+included=OBSERVATIONAL → Low(正确),但 Judge 可能将其误判为不合理 + +--- + +## 改动一:`study_type_correct` 扩展为 `study_type_audit` + +### 原审计段 + +``` +**study_type_correct**:Appraise Agent对研究类型(study_type)的识别是否准确? +- `YES`:所有研究的study_type识别正确(RCT/COHORT/CASE_CONTROL/CASE_REPORT) +- `PARTIAL`:大部分正确,个别研究类型有可商榷之处 +- `NO`:存在明显错误(如将观察性研究标记为RCT,或将RCT标记为COHORT) +``` + +### 替换为 + +``` +**study_type_correct**:Appraise Agent对研究类型(study_type)的识别是否准确? +- `YES`:所有研究的 study_type 识别正确 +- `PARTIAL`:大部分正确,个别研究类型有可商榷之处 +- `NO`:存在明显错误(如将观察性研究标记为RCT) + +**included_study_type_correct**(仅当 study_type 包含 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时判断): +SR/MA/NMA 的 `included_study_type` 字段填写是否正确? +- `YES`:字段与摘要描述的纳入研究类型相符(如摘要明确描述"纳入RCT"→ RCT;纳入队列研究 → OBSERVATIONAL) +- `PARTIAL`:字段基本合理,但摘要信息不足以确认(如摘要未描述纳入类型 → UNKNOWN 是合理选择) +- `NO`:明显错误(如摘要写"仅纳入RCT"但标注为 OBSERVATIONAL) +- `NA`:证据列表中没有 SR/MA/NMA 类型研究 +``` + +--- + +## 改动二:新增升级因素合理性审计 + +### 在 `downgrade_factors_appropriate` 后新增 + +``` +**upgrade_factors_appropriate**(仅当证据列表中存在 COHORT/CASE_CONTROL 研究时判断): +升级因素(large_effect / dose_response / confounding_bias_mitigates)的标注是否合理? +- `YES`:升级因素的 YES/NO 标注与摘要信息相符 +- `PARTIAL`:整体合理,个别因素有轻微偏差 +- `NO`:存在明显错误(如无明确剂量效应数据但标注 dose_response=YES) +- `NA`:证据列表中没有 COHORT/CASE_CONTROL 研究 + +**upgrade_blocked_appropriate**(仅当存在 COHORT/CASE_CONTROL 且 risk_of_bias=SERIOUS/VERY_SERIOUS 时): +存在严重偏倚风险时,升级因素是否被正确阻断(upgrade_blocked_by_bias=True)? +- `YES`:risk_of_bias=SERIOUS/VERY_SERIOUS 时,upgrade_blocked_by_bias 正确标注为 True,且最终等级未因升级因素提升 +- `NO`:存在严重偏倚但升级因素仍被计入(系统 bug 信号,需上报) +- `NA`:无 COHORT/CASE_CONTROL 研究,或 risk_of_bias 均为 NOT_SERIOUS +``` + +--- + +## 改动三:更新 `computed_grade_reasonable` 判断标准说明 + +### 在该审计项说明中追加注意事项 + +``` +**computed_grade_reasonable**:系统根据分类计算出的最终GRADE等级(computed_grade)是否合理? 
+- `YES`:计算结果与基于摘要的独立判断一致 +- `PARTIAL`:整体合理,个别研究的等级有轻微偏差 +- `NO`:计算结果明显不合理(通常是因为study_type或降级因素分类错误导致) + +注意以下情况属于**合理结果**,不应判断为 NO: +- SR/MA 纳入观察性研究(included_study_type=OBSERVATIONAL)→ 初始分为 Low(2分),即使无降级因素也可能输出 Low/Very Low +- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时,即使 large_effect=YES 也不升级 → computed_grade 停在 Low +- COHORT/CASE_CONTROL 经升级后最高只能到 Moderate → 不应期望输出 High +- CROSS_SECTIONAL 无升级因素 → 最高只能到 Low(初始分即为2) +``` + +--- + +## 改动四:输出格式统一 + +将 `reasoning` 字段替换为 `failures` + `overall_quality`,与 Ask Judge 框架统一: + +```json +{ + "grade_audit": { + "study_type_correct": "YES | PARTIAL | NO", + "included_study_type_correct": "YES | PARTIAL | NO | NA", + "downgrade_factors_appropriate": "YES | PARTIAL | NO", + "upgrade_factors_appropriate": "YES | PARTIAL | NO | NA", + "upgrade_blocked_appropriate": "YES | NO | NA", + "computed_grade_reasonable": "YES | PARTIAL | NO" + }, + "conflict_audit": { + "conflicts_exist": "YES | NO", + "conflicts_identified": "YES | PARTIAL | NO | NA" + }, + "data_audit": { + "numerical_data_extracted": "YES | PARTIAL | NO | NA", + "confidence_level_appropriate": "HIGH | MODERATE | LOW | VERY_LOW" + }, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | degraded" +} +``` + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/judge/appraise_judge.txt` | 修改 | `study_type_correct` 扩展(新增 `included_study_type_correct`);新增升级因素审计(`upgrade_factors_appropriate` / `upgrade_blocked_appropriate`);更新 `computed_grade_reasonable` 注意事项;输出改为 `failures` + `overall_quality` 统一框架 | + +--- + +## 明确不在本次范围内 + +- `_score_appraise` Python 侧权重调整 +- PIRD 场景下 CROSS_SECTIONAL(横断面研究)应使用 QUADAS-2 的处理 +- SR 纳入观察性研究时的升级因素适配 diff --git a/docs/superpowers/specs/2026-04-20-ask-agent-redesign.md b/docs/superpowers/specs/2026-04-20-ask-agent-redesign.md new file mode 100644 index 0000000..a6911f6 --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-ask-agent-redesign.md @@ -0,0 +1,207 @@ +# Ask Agent 重设计规范 + +**日期**: 2026-04-20 +**范围**: 
Ask 阶段(`ask_agent.py` + `ask_agent.txt` + `schema.py` + `coordinator.py` 小改) +**不在本次范围内**: Acquire/Appraise/Apply/Assess 阶段的格式适配;PICo(质性研究)格式支持;多子问题并行执行架构 + +--- + +## 背景与目标 + +当前 Ask 阶段直接将用户问题结构化为 PICO,存在以下问题: + +1. **无路由**:所有问题一律走 PICO,导致诊断准确性、预后、病因等类型的问题被错误结构化 +2. **`question_type` 无 Judge 覆盖**:分类错误无法被捕获,会传导到 Acquire 的搜索过滤器选择 +3. **单一格式**:PICO 不适用于诊断准确性(应用 PIRD)、病因(PEO)、预后等问题类型 +4. **无问题性质判断**:急救操作类问题不适合走完整 5A 流程 + +目标:在 Ask 阶段引入路由机制,先判断问题性质再选择对应处理路径,并让 Judge 覆盖路由正确性验证。 + +--- + +## 整体流程 + +``` +用户输入 + │ + ▼ +[路由 LLM 调用] ← router.txt + │ + ├─ direct_answer ──────────────────────→ [直接输出 + 免责声明] → coordinator 终止流程 + │ (满足全部3条触发条件的急救/操作规范) + │ + ├─ diagnostic_reasoning ───────────────→ [Step1: 鉴别诊断 LLM] → [Step2: 串行×≤3个子PICO LLM] + │ 子PICO写入 sub_pico_queries,等待后续迭代实现并行5A流程 + │ + └─ ebm_pico / ebm_pird / ebm_peo / ebm_prognosis + │ + ▼ + [EBM结构化 LLM 调用] ← ebm_*.txt + │ + ▼ + [EBMQuery 输出] → 写入 WorkflowState → 后续 Acquire 等阶段 + +路由验证与结构化质量验证由 Ask Judge 在独立的 judge_llm.py 中实现(不在本次范围内)。 +``` + +--- + +## 路由分类 + +### 路由输出结构 + +```json +{ + "route_type": "direct_answer | diagnostic_reasoning | ebm_pico | ebm_pird | ebm_peo | ebm_prognosis", + "reasoning": "一句话路由依据" +} +``` + +### 各路由触发规则 + +| 路由类型 | 触发条件 | +|---|---| +| `direct_answer` | 同时满足3条(见下) | +| `diagnostic_reasoning` | 问题核心是"这是什么病/鉴别诊断是什么",需要从临床特征推断诊断 | +| `ebm_pico` | 治疗/干预效果比较(RCT 适用) | +| `ebm_pird` | 诊断测试的准确性/灵敏度/特异性 | +| `ebm_peo` | 病因、危险因素、有害暴露 | +| `ebm_prognosis` | 疾病自然病程、预后因素、生存率 | + +### `direct_answer` 触发的3条条件(须全部满足) + +1. 问题要求立即操作性指导(动词如:如何处理、立即给、紧急处置) +2. 延迟回答会直接危及患者生命安全 +3. 答案来自已有公认标准流程(BLS/ACLS/指南操作章节) + +**边界示例:** +- "心肺复苏按压深度" → 满足全部3条 → `direct_answer` ✓ +- "脓毒症抗生素初始选择" → 不满足条件3(无单一公认操作标准)→ `ebm_pico` +- "急性心梗用阿司匹林" → 不满足条件3 → `ebm_pico` + +--- + +## 各路由处理细节 + +### A. `direct_answer` + +单次 LLM 调用,输出急救/操作规范步骤,强制附加: +- 免责声明:"本答案来自公认操作规范,未经循证检索,仅供参考" +- 知识截止日期标注 + +输出写入 `WorkflowState.direct_answer_output`,coordinator 检测到后直接终止,跳过 Acquire 等阶段。 + +### B. 
`diagnostic_reasoning` + +**Step1 LLM 调用**(diag_step1.txt): + +输入:原始问题 +输出: +```json +{ + "clinical_features": ["症状/体征/检查结果"], + "differential_diagnoses": [ + { "diagnosis": "xxx", "priority": 1, "rationale": "危重,需优先排除" }, + { "diagnosis": "yyy", "priority": 2, "rationale": "最可能" }, + { "diagnosis": "zzz", "priority": 3, "rationale": "常见鉴别" } + ] +} +``` + +Prompt 硬约束:输出上限3个诊断,优先排序规则:需立即排除的危重疾病 > 最可能的诊断 > 常见鉴别。 + +**Step2 LLM 调用(串行,每次1个诊断)**(diag_step2.txt): + +输入模板(每次仅传入1个诊断): +``` +患者临床特征:{clinical_features} +当前鉴别诊断:{single_diagnosis} +任务:将该诊断转化为 EBM 可检索的子问题 +``` + +输出:针对该诊断的 `EBMQuery`(通常为 `ebm_pico` 类型) + +所有子问题写入 `WorkflowState.sub_pico_queries`。**本次迭代不实现并行5A执行**,子问题的后续处理留待下一迭代。 + +### C. EBM 格式结构化(4种) + +每种格式对应独立 prompt 文件,输出统一为 `EBMQuery`。 + +--- + +## 数据类设计 + +### 新增 `EBMQuery` + +```python +@dataclass +class EBMQuery: + query_type: str # "pico" | "pird" | "peo" | "prognosis" + patient: str # P(所有格式共用) + primary_focus: str # PICO→intervention;PIRD→index_test;PEO→exposure;Prognosis→prognostic_factor + comparator: Optional[str] # PICO→comparison;PIRD→reference_standard;PEO/Prognosis→None(不适用) + outcome: str # O/D(所有格式共用) + keywords: List[str] # 英文 MeSH 关键词 + reference_standard: Optional[str] = None # PIRD 专用(R字段) + time_horizon: Optional[str] = None # Prognosis 专用 +``` + +PIRD 字段映射(明确修正): +- P = `patient` +- I = `primary_focus`(index test,待评估的诊断测试) +- R = `comparator` + `reference_standard`(参考标准/金标准,冗余存储以保持语义) +- D = `outcome`(诊断准确性结局) + +`PICOQuery` 保持不变(向后兼容)。过渡期内 `WorkflowState` 同时保留 `pico_query` 和新的 `ebm_query`;非 PICO 路由使用 `ebm_query`,Acquire 等下游阶段读取 `query_type` 后当前迭代降级为 PICO 行为,后续迭代逐格式适配。 + +### `WorkflowState` 新增字段 + +```python +route_type: Optional[str] # 路由结果 +route_confidence: Optional[str] # "normal"(默认,路由首次通过)| "low"(重试超限后 fallback 标记) + # 路由 LLM 调用成功后无论是否重试,均写入该字段;初始值 None 仅在 Ask 阶段未执行时存在 +direct_answer_output: Optional[str] # direct_answer 类的最终输出 +ebm_query: Optional[EBMQuery] # 非PICO格式的结构化输出 +sub_pico_queries: Optional[List[EBMQuery]] # 
诊断推理的子问题列表 +sub_question_index: Optional[int] # 当前处理第几个子问题(0-based) +sub_question_total: Optional[int] # 子问题总数 +``` + +--- + +## Prompt 文件结构 + +``` +src/config/prompts/ask/ +├── router.txt # 路由分类(含3条 direct_answer 触发条件) +├── direct_answer.txt # 急救/操作规范直接回答 +├── diag_step1.txt # 鉴别诊断生成(MAX=3 硬约束 + 优先排序规则) +├── diag_step2.txt # 单诊断→EBMQuery 转化(每次1个诊断) +├── ebm_pico.txt # PICO 格式(从 ask_agent.txt 迁移改写) +├── ebm_pird.txt # PIRD 格式(P/I/R/D 字段明确定义) +├── ebm_peo.txt # PEO 格式 +└── ebm_prognosis.txt # 预后格式(含 time_horizon) +``` + +旧 `src/config/prompts/ask_agent.txt` 废弃,功能由 `ask/ebm_pico.txt` 替代。 + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/ask/` | 新建目录,8个文件 | 见上方 Prompt 文件结构 | +| `src/agents/ask_agent.py` | 重写 | 路由→Judge→分支调用→统一输出 | +| `src/state/schema.py` | 扩展 | 新增 `EBMQuery`,`WorkflowState` 新增7个字段 | +| `src/coordinator/coordinator.py` | 小改 | 检测 `route_type == "direct_answer"` 后提前终止 | +| `src/config/prompts/ask_agent.txt` | 废弃(保留文件,不删除) | 由 `ask/ebm_pico.txt` 替代 | + +--- + +## 明确不在本次范围内 + +- Acquire/Appraise/Apply/Assess 对非PICO格式的完整适配(当前降级为PICO行为) +- `diagnostic_reasoning` 子问题的并行5A执行(子问题已结构化,执行逻辑留待下一迭代) +- PICo(质性研究)格式支持(需 CERQual 评价框架,单独迭代) +- `ebm_query` 完全替换 `pico_query`(本次过渡期并存) diff --git a/docs/superpowers/specs/2026-04-20-ask-judge-redesign.md b/docs/superpowers/specs/2026-04-20-ask-judge-redesign.md new file mode 100644 index 0000000..02bd6cb --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-ask-judge-redesign.md @@ -0,0 +1,428 @@ +# Ask Judge 重设计规范 + +**日期**: 2026-04-20 +**范围**: `ask_judge.txt`(重写)+ `judge_llm.py`(`_score_ask` 重写 + `_precheck_ask` 新增)+ `coordinator.py`(小改)+ `apply_agent.py`(小改) +**不在本次范围内**: 其他阶段 Judge 的 route_type 适配 + +--- + +## 背景与问题 + +原 Ask Judge 存在以下问题: + +1. **`route_type` 完全未被审计**:Ask 新架构的核心输出之一,分类错误无法捕获 +2. **非 PICO 路由用错误框架审计**:PIRD/PEO/Prognosis 被套用 PICO 四要素,产生误判 +3. **`keywords_english_medical`/`has_synonyms_or_mesh` 由 LLM 判断**:可规则化的格式检查浪费 LLM 调用 +4. 
**`routing_decision` 原设计由 LLM 输出**:违背"LLM 分类→Python 计算"架构原则 +5. **`_score_diagnostic_reasoning` 无权重定义** +6. **Pass/Fail 阈值与 `dimension_scores` 转化关系未明确** +7. **`reasoning` 字段信息密度不足**,无法支撑决策模型 +8. **`format_match` 与路由验证职责重叠**(冗余,删除) +9. **`route_appropriateness` PARTIAL 处理逻辑缺失**(简化为 YES/NO) + +--- + +## 架构决策 + +| 决策 | 选择 | 理由 | +|---|---|---| +| Prompt 共用 vs 独立 | 单 prompt + Python 动态注入对应路由段落 | LLM 不处理条件判断,prompt 精简 | +| 两阶段 vs 一阶段 Judge | 合并为一次调用 | 路由和结构化是同一 Ask 调用的输出,分两次引入的状态传递复杂度不值得 | +| `routing_decision` | Python 推导,不由 LLM 输出 | 与整个 Judge 架构(LLM 分类→Python 计算)保持一致 | +| `route_appropriateness` | 简化为 YES/NO | PARTIAL 无明确后续动作;歧义由 `ambiguity_flag` 单独承担 | +| `diagnostic_reasoning` Judge | 所有子PICO完成后一次批量调用 | 避免 N+1 次调用;与其他路由调用次数对齐 | +| `keywords_english_medical`/`has_synonyms_or_mesh` | 移至 Python 预检 | 正则/数组操作,无需 LLM 语义理解 | +| `format_match` | 删除 | 与路由验证职责重叠 | + +--- + +## Python 预检(调用 Judge LLM 前) + +```python +def _precheck_ask(pico_dict: dict) -> dict: + """在调用 Judge LLM 前,用 Python 规则完成可确定性检查。""" + import re + chinese = re.compile(r'[\u4e00-\u9fff]') + keywords = pico_dict.get("keywords", []) + + keywords_english = not any(chinese.search(kw) for kw in keywords) + has_synonyms = len(set(keywords)) >= 2 + keyword_count_ok = len(keywords) > 1 # False → boolean_logic_valid 直接 NO + + return { + "keywords_english_medical": "YES" if keywords_english else "NO", + "has_synonyms_or_mesh": "YES" if has_synonyms else "NO", + "keyword_count_ok": keyword_count_ok, + } +``` + +预检结果直接写入 `_score_ask`,不传给 LLM Judge。 + +--- + +## Judge Prompt 结构 + +### 固定段(所有路由共用) + +``` +# Role +你是一个严格的EBM审计员,对 Ask Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 + +# Input +原始问题:{original_question} +路由类型:{route_type} +Ask Agent 输出:{stage_output} + +# 固定审计项 + +## 路由审计 +route_appropriateness: YES | NO +- YES:route_type 与问题类型匹配 +- NO:明显不符(如诊断准确性问题路由为 ebm_pico) + +ambiguity_flag: YES | NO +- YES:问题本身存在路由歧义(如同时涉及诊断和治疗),当前路由是合理选择之一 +- NO:路由方向明确 + +## 安全审计(Critical,一票否决) +intent_distorted: YES | NO +- 
YES:结构化结果扭曲了原问题核心意图(方向性错误:问儿童→写成人;问治疗→写诊断) +- NO:意图基本一致 +注意:intent_distorted 关注方向性错误,不关注描述清晰度; + pico_statement_unambiguous 关注描述清晰度(两者独立,可同时触发) + +## 关键词语义审计(仅语义合理性;英文/数量已由 Python 预检) +boolean_logic_valid: YES | NO +- YES:关键词覆盖了问题的核心维度(至少覆盖 P + I/Exposure/IndexTest 两个维度),无明显冗余 +- NO:关键词全部指向同一概念,或包含大量明显无关词,或数量严重不足 +``` + +### 动态注入段(Python 按 route_type 选择注入) + +**ebm_pico:** +``` +## 结构审计 (PICO) +P: YES|PARTIAL|NO 患者/人群是否明确(年龄、疾病状态等) +I: YES|PARTIAL|NO 干预措施是否明确 +C: YES|NA|NO 对照组(原问题不涉及对照→NA) +O: YES|PARTIAL|NO 临床结局是否明确 +pico_statement_unambiguous: YES|PARTIAL|NO + YES=表述明确无歧义;PARTIAL=轻微歧义不影响检索方向;NO=严重歧义难以检索 +``` + +**ebm_pird:** +``` +## 结构审计 (PIRD) +P: YES|PARTIAL|NO 患者人群是否明确 +I: YES|PARTIAL|NO Index Test(待评估的诊断测试)是否明确 +R: YES|PARTIAL|NA Reference Standard(金标准)是否明确(原问题未提及→NA) +D: YES|PARTIAL|NO Target Condition(诊断结局)是否明确 +pico_statement_unambiguous: YES|PARTIAL|NO +``` + +**ebm_peo:** +``` +## 结构审计 (PEO) +P: YES|PARTIAL|NO 患者人群是否明确 +E: YES|PARTIAL|NO Exposure(暴露因素)是否明确 +O: YES|PARTIAL|NO Outcome(结局)是否明确 +(PEO 无 Comparator,不审计 C 字段) +pico_statement_unambiguous: YES|PARTIAL|NO +``` + +**ebm_prognosis:** +``` +## 结构审计 (Prognosis) +P: YES|PARTIAL|NO 患者人群是否明确 +PF: YES|PARTIAL|NO Prognostic Factor(预后因素)是否明确 +O: YES|PARTIAL|NO 结局是否明确 +TH: YES|PARTIAL|NA Time Horizon(随访时间窗)是否明确(原问题未提及→NA) +pico_statement_unambiguous: YES|PARTIAL|NO +``` + +**direct_answer:** +``` +## 结构审计 (direct_answer) +all_three_conditions_met: YES | NO +三个条件(须全部满足才应路由到 direct_answer): + 1. 问题要求立即操作性指导(动词:如何处理/立即给/紧急处置) + 2. 延迟回答会直接危及患者生命安全 + 3. 
答案来自已有公认标准流程(BLS/ACLS/指南操作章节) +YES=三条均满足;NO=任一条不满足(应重新路由到 EBM 流程) + +standard_protocol_cited: YES | NO 是否引用了公认标准操作规范 +``` + +**diagnostic_reasoning(Step1+所有Step2完成后批量):** +``` +## 结构审计 (Diagnostic Reasoning) + +### 鉴别诊断质量(Step1) +clinical_feature_completeness: YES|PARTIAL|NO 关键症状/体征/检查是否遗漏 +differential_reasonableness: YES|PARTIAL|NO 鉴别诊断是否与临床特征匹配 +critical_diagnosis_prioritized: YES|NO 危重/需立即排除的诊断是否排在前列 + +### 子PICO对应关系(Step2,批量) +sub_pico_audit: 数组,每个元素: + - diagnosis: 对应的鉴别诊断名称 + - correspondence: YES|PARTIAL|NO + - issue: 若非YES,说明具体问题;否则填null +``` + +### 输出格式(所有路由共用框架) + +```json +{ + "route_audit": { + "route_appropriateness": "YES | NO", + "ambiguity_flag": "YES | NO" + }, + "safety_audit": { + "intent_distorted": "YES | NO" + }, + "search_audit": { + "boolean_logic_valid": "YES | NO" + }, + "structure_audit": { + /* 动态字段,按 route_type 变化,见上方各段 */ + }, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | degraded" +} +``` + +`routing_decision` 不在 LLM 输出中,由 Python 推导。 + +--- + +## Python 评分:`_score_ask` + +### 维度权重 + +```python +STAGE_WEIGHTS = { + "Ask": { + "pico_completeness": 0.45, + "searchability": 0.30, + "clarity": 0.25, + }, + # 其他阶段不变 +} + +PASS_THRESHOLD = 0.70 # 沿用现有阈值 +``` + +### 评分逻辑 + +```python +def _score_ask(audit: dict, precheck: dict, route_type: str + ) -> Tuple[dict, list, bool, str]: + issues = [] + + # 0. Python 预检失败项注入 issues + if precheck["keywords_english_medical"] == "NO": + issues.append({"severity": "major", "dimension": "searchability", + "description": "keywords 包含中文,必须全部使用英文医学术语(MeSH)"}) + if precheck["has_synonyms_or_mesh"] == "NO": + issues.append({"severity": "minor", "dimension": "searchability", + "description": "缺少同义词扩展,请为核心概念补充 MeSH 词或常见别名"}) + if not precheck["keyword_count_ok"]: + issues.append({"severity": "major", "dimension": "searchability", + "description": "关键词数量不足(≤1),无法构成有效检索策略"}) + + # 1. 
安全项:intent_distorted(一票否决) + if audit.get("safety_audit", {}).get("intent_distorted") == "YES": + return ( + {"pico_completeness": 0.0, "searchability": 0.0, "clarity": 0.0}, + [{"severity": "critical", "dimension": "pico_completeness", + "description": "PICO结构化结果严重扭曲了用户原始意图"}], + False, "意图严重扭曲,任务失败" + ) + + # 2. 路由失败(route_appropriateness=NO,一票否决) + if audit.get("route_audit", {}).get("route_appropriateness") == "NO": + return ( + {"pico_completeness": 0.0, "searchability": 0.0, "clarity": 0.0}, + [{"severity": "critical", "dimension": "pico_completeness", + "description": "路由分类错误,需重新路由"}], + False, "路由错误,需重试" + ) + + # 3. 结构化得分(按 route_type 分支) + structure = audit.get("structure_audit", {}) + + if route_type in ("ebm_pico", "ebm_pird", "ebm_peo", "ebm_prognosis"): + pico_completeness = _score_structure_fields(structure, route_type, issues) + elif route_type == "direct_answer": + pico_completeness = 1.0 if structure.get("all_three_conditions_met") == "YES" else 0.0 + if structure.get("all_three_conditions_met") == "NO": + issues.append({"severity": "critical", "dimension": "pico_completeness", + "description": "direct_answer 三个触发条件未全部满足,应重新路由到 EBM 流程"}) + elif route_type == "diagnostic_reasoning": + pico_completeness = _score_diagnostic_reasoning(structure, issues) + else: + pico_completeness = 0.5 + + # 4. searchability(Python 预检 + LLM boolean_logic) + kw_score = 1.0 if precheck["keywords_english_medical"] == "YES" else 0.0 + syn_score = 1.0 if precheck["has_synonyms_or_mesh"] == "YES" else 0.0 + bl_score = 1.0 if audit.get("search_audit", {}).get("boolean_logic_valid") == "YES" else 0.0 + searchability = (kw_score + syn_score + bl_score) / 3 + + # 5. 
clarity + clarity_map = {"YES": 1.0, "PARTIAL": 0.5, "NO": 0.1} + clarity = clarity_map.get(structure.get("pico_statement_unambiguous", "YES"), 1.0) + if structure.get("pico_statement_unambiguous") == "NO": + issues.append({"severity": "major", "dimension": "clarity", + "description": "PICO表述存在严重歧义,请重新提炼问题"}) + elif structure.get("pico_statement_unambiguous") == "PARTIAL": + issues.append({"severity": "minor", "dimension": "clarity", + "description": "PICO表述存在轻微歧义,请澄清不明确的术语"}) + + dimension_scores = { + "pico_completeness": pico_completeness, + "searchability": searchability, + "clarity": clarity, + } + return dimension_scores, issues, False, "; ".join(audit.get("failures", [])) +``` + +### EBM 格式结构字段权重(`_score_structure_fields`) + +| 字段 | PICO | PIRD | PEO | Prognosis | YES | PARTIAL | NO | NA | +|---|---|---|---|---|---|---|---|---| +| P(人群) | 3 | 3 | 3 | 3 | 1.0 | 0.4 | 0.0 | 1.0 | +| I/IndexTest/Exposure/PF | 3 | 3 | 3 | 3 | 1.0 | 0.4 | 0.0 | — | +| C/R | 1 | 2 | — | — | 1.0 | 0.4 | 0.0 | 1.0 | +| O/D | 2 | 2 | 2 | 2 | 1.0 | 0.4 | 0.0 | — | +| TH(time_horizon) | — | — | — | 1 | 1.0 | 0.4 | 0.0 | 1.0 | + +分数 = Σ(字段权重 × 字段得分) / Σ字段权重 + +### `_score_diagnostic_reasoning` 权重 + +```python +def _score_diagnostic_reasoning(structure: dict, issues: list) -> float: + label_map = {"YES": 1.0, "PARTIAL": 0.4, "NO": 0.0} + + # Step1:鉴别诊断质量(60%) + # critical_diagnosis_prioritized 权重最高(患者安全) + cf = label_map.get(structure.get("clinical_feature_completeness", "YES"), 1.0) + dr = label_map.get(structure.get("differential_reasonableness", "YES"), 1.0) + cp = 1.0 if structure.get("critical_diagnosis_prioritized") != "NO" else 0.0 + step1 = cf * 0.30 + dr * 0.30 + cp * 0.40 + + if structure.get("clinical_feature_completeness") == "NO": + issues.append({"severity": "major", "dimension": "pico_completeness", + "description": "关键临床特征提取不完整,鉴别诊断可能遗漏重要线索"}) + if structure.get("differential_reasonableness") == "NO": + issues.append({"severity": "major", "dimension": "pico_completeness", 
+ "description": "鉴别诊断与临床特征不匹配,请重新分析"}) + if structure.get("critical_diagnosis_prioritized") == "NO": + issues.append({"severity": "critical", "dimension": "pico_completeness", + "description": "危重/需立即排除的诊断未排在首位,存在患者安全风险"}) + + # Step2:子PICO对应关系(40%) + sub_audits = structure.get("sub_pico_audit", []) + if not sub_audits: + step2 = 1.0 # 尚未生成子PICO,不扣分 + else: + corr_scores = [label_map.get(s.get("correspondence", "YES"), 1.0) + for s in sub_audits] + step2 = sum(corr_scores) / len(corr_scores) + for s in sub_audits: + if s.get("correspondence") == "NO": + issues.append({"severity": "major", "dimension": "pico_completeness", + "description": f"子PICO({s.get('diagnosis','?')})" + f"与鉴别诊断不对应:{s.get('issue','')}"}) + elif s.get("correspondence") == "PARTIAL": + issues.append({"severity": "minor", "dimension": "pico_completeness", + "description": f"子PICO({s.get('diagnosis','?')})" + f"对应关系有偏差:{s.get('issue','')}"}) + + return step1 * 0.60 + step2 * 0.40 +``` + +--- + +## `routing_decision` Python 推导 + +```python +def _derive_routing_decision(audit: dict, pass_threshold: bool, + retry_count: int, max_retry: int = 2) -> str: + route_ok = audit.get("route_audit", {}).get("route_appropriateness") == "YES" + intent_ok = audit.get("safety_audit", {}).get("intent_distorted") == "NO" + + if not intent_ok: + return "retry_structure" if retry_count < max_retry else "fallback" + if not route_ok: + return "retry_route" if retry_count < max_retry else "fallback" + if pass_threshold: + return "proceed" + return "retry_structure" if retry_count < max_retry else "fallback" +``` + +### Pass/Fail 判定 + +```python +overall_score = _calculate_overall_score("Ask", dimension_scores) +has_critical = any(i["severity"] == "critical" for i in raw_issues) +pass_threshold = (overall_score >= PASS_THRESHOLD) and not has_critical +``` + +Pass 条件:加权分 ≥ 0.70 **且** 无 critical issue。 + +### 完整决策流 + +``` +overall_score ≥ 0.70 且无 critical + → pass_threshold=True → routing_decision="proceed" → 进入 Acquire + 
+route_appropriateness=NO 且意图未扭曲(路由错误,critical) + → routing_decision="retry_route" → 重新路由(最多2次) + +intent_distorted=YES,或路由正确但 overall_score < 0.70 / 存在其他 critical(结构化不达标) + → routing_decision="retry_structure" → 重新结构化(最多2次) + +超过 max_retry + → routing_decision="fallback" → route_confidence="low",强制 ebm_pico 继续 +``` + +--- + +## `route_confidence` 下游传递 + +```python +# judge_llm.py evaluate_stage()(Ask 阶段) +if stage == "Ask": + retry_count = state.get("agent_call_counts", {}).get("Ask", 1) - 1 + routing_decision = _derive_routing_decision(audit, pass_threshold, retry_count) + state["_ask_routing_decision"] = routing_decision + if routing_decision == "fallback": + state["route_confidence"] = "low" + +# apply_agent.py:生成推荐时 +if state.get("route_confidence") == "low": + recommendation.caveats.append( + "本问题的结构化框架存在路由不确定性(Ask 阶段降级处理),推荐结论需结合临床判断" + ) +``` + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/judge/ask_judge.txt` | 重写 | 单 prompt + 动态注入;路由/安全/搜索/结构四块;删除 `format_match`;`failures`+`overall_quality` 输出;删除 `routing_decision` 输出字段 | +| `src/judge/judge_llm.py` | 修改 | `_precheck_ask` 新增;`_score_ask` 重写(含分支权重);`_score_structure_fields` 新增;`_score_diagnostic_reasoning` 新增;`_derive_routing_decision` 新增;`evaluate_stage` 中 Ask 阶段写入 `route_confidence` | +| `src/coordinator/coordinator.py` | 小改 | 读取 `_ask_routing_decision` 执行 retry_route / retry_structure / fallback 分支 | +| `src/agents/apply_agent.py` | 小改 | 检测 `route_confidence="low"` 时追加 caveat | + +--- + +## 明确不在本次范围内 + +- Acquire/Appraise/Apply/Assess Judge 的 route_type 适配 +- `ambiguity_flag=YES` 时的 UI 提示(当前仅写入 `route_confidence` 日志) +- `WorkflowState` 中 `route_confidence` 字段的持久化格式(实现阶段决定) diff --git a/docs/superpowers/specs/2026-04-20-assess-judge-redesign.md b/docs/superpowers/specs/2026-04-20-assess-judge-redesign.md new file mode 100644 index 0000000..d1f30cc --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-assess-judge-redesign.md @@ -0,0 +1,139 @@ +# Assess Judge 改动规范 + +**日期**: 2026-04-20 +**范围**: 
`assess_judge.txt`(修改) +**不在本次范围内**: `_score_assess` Python 侧权重调整;Assess Agent 本身的逻辑改动 + +--- + +## 背景与问题 + +现有 Assess Judge 存在以下问题: + +1. **输入仍用 `{pico_query}`**:所有前序阶段已切换到 `route_type + ebm_query` 架构,Assess Judge 的全链路回顾应当感知路由类型 +2. **`ask_to_acquire_link` 描述硬编码 P/I/O**:PIRD 场景下应检查 "Index Test 覆盖",Prognosis 应检查 "Prognostic Factor",现在全部写死为 "P/I/O 要素",导致 PIRD/Prognosis 场景下审计逻辑错误 +3. **`route_confidence` 未感知**:Ask Judge 可能降级(fallback → `route_confidence=low`),Assess 作为最终链路审查,应当认知这个状态并在完整性审计中特别标注,否则会将"因路由不确定性导致的证据缺失"误判为链路质量问题 + +--- + +## 改动一:输入字段更新 + +### `assess_judge.txt` 输入段替换 + +**原:** +``` +## 完整推理链摘要 +- PICO查询: {pico_query} +- 证据数量: {evidence_count} +- 证据质量分布: {grade_distribution} +- 最终推荐: {recommendation} +``` + +**替换为:** +``` +## 完整推理链摘要 +- 路由类型: {route_type} +- 路由置信度: {route_confidence} +- 结构化查询: {ebm_query} +- 证据数量: {evidence_count} +- 证据质量分布: {grade_distribution} +- 最终推荐: {recommendation} +``` + +`route_confidence` 取值为 `"normal"`(默认)或 `"low"`(Ask 阶段 fallback 时写入)。 + +--- + +## 改动二:`ask_to_acquire_link` 描述动态化 + +### 原描述 + +``` +**ask_to_acquire_link**:Ask阶段的PICO是否有效指导了Acquire阶段的检索? +- `CLEAR`:检索策略直接来源于PICO,关键词与P/I/O要素对应明确 +- `WEAK`:关联存在但不够紧密,检索词覆盖了PICO的主要方面但有跳跃 +- `BROKEN`:检索策略与PICO脱节,检索了与PICO无关的主题 +``` + +### 替换为 + +``` +**ask_to_acquire_link**:Ask 阶段的结构化查询是否有效指导了 Acquire 阶段的检索? +各 route_type 对应的审计重点: +- ebm_pico: 关键词是否覆盖 Patient + Intervention + Outcome +- ebm_pird: 关键词是否覆盖 Patient + Index Test + Target Condition +- ebm_peo: 关键词是否覆盖 Patient + Exposure + Outcome +- ebm_prognosis: 关键词是否覆盖 Patient + Prognostic Factor + Outcome +- diagnostic_reasoning: 关键词是否覆盖 Clinical Presentation + 鉴别诊断方向 +- direct_answer: 不经过 Acquire 阶段,此项标注为 NA + +- `CLEAR`:检索策略直接来源于结构化查询,关键词与对应框架维度对应明确 +- `WEAK`:关联存在但不够紧密,检索词覆盖了主要维度但存在跳跃或遗漏 +- `BROKEN`:检索策略与结构化查询脱节,检索了完全无关的主题 +- `NA`:route_type 为 direct_answer,不适用 +``` + +--- + +## 改动三:新增 `route_confidence_noted` 审计项 + +### 在 `## 1. 
回答完整性审计` 中新增 + +``` +**route_confidence_noted**:若 route_confidence=low(Ask 阶段因路由不确定而降级处理), +最终回答是否已注明路由不确定性带来的局限? +- `YES`:输出中明确提及路由不确定性或结构化框架的局限,提示需结合临床判断 +- `NO`:route_confidence=low 但输出未有任何提示(可能给用户错误的置信感) +- `NA`:route_confidence=normal,路由无不确定性,此项不适用 + +注意:route_confidence=low 时,Apply agent 应已自动追加 caveat( +"本问题的结构化框架存在路由不确定性(Ask 阶段降级处理),推荐结论需结合临床判断")。 +此处验证该 caveat 是否确实出现在最终输出中。 +``` + +--- + +## 改动四:输出格式更新 + +```json +{ + "completeness_audit": { + "original_question_answered": "YES | PARTIAL | NO", + "evidence_limitations_stated": "YES | NO | NA", + "route_confidence_noted": "YES | NO | NA" + }, + "chain_audit": { + "ask_to_acquire_link": "CLEAR | WEAK | BROKEN | NA", + "acquire_to_appraise_link": "CLEAR | WEAK | BROKEN", + "appraise_to_apply_link": "CLEAR | WEAK | BROKEN" + }, + "consistency_audit": { + "grade_to_strength_consistent": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION", + "no_internal_contradictions": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION" + }, + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | degraded" +} +``` + +--- + +## `_score_assess` 评分说明(无需改动) + +`route_confidence_noted=NO` 时(即 route_confidence=low 但输出无提示),属于 minor issue,影响 `completeness` 维度得分,不构成一票否决,现有评分逻辑可直接处理。 + +--- + +## 文件改动清单 + +| 文件 | 改动类型 | 说明 | +|---|---|---| +| `src/config/prompts/judge/assess_judge.txt` | 修改 | 输入加 `{route_type}` / `{route_confidence}` / `{ebm_query}`;`ask_to_acquire_link` 描述动态化(含各路由审计重点 + direct_answer NA);新增 `route_confidence_noted` 审计项;输出改为 `failures` + `overall_quality` 统一框架 | + +--- + +## 明确不在本次范围内 + +- `_score_assess` Python 侧权重调整 +- Assess Agent 本身的逻辑改动 +- `ambiguity_flag=YES` 时的 UI 提示(仅写入日志) diff --git a/docs/superpowers/specs/2026-04-22-judge-rubrics-redesign.md b/docs/superpowers/specs/2026-04-22-judge-rubrics-redesign.md new file mode 100644 index 0000000..a29c454 --- /dev/null +++ b/docs/superpowers/specs/2026-04-22-judge-rubrics-redesign.md @@ -0,0 +1,419 @@ +# Judge Rubrics 重设计规范 + +**日期**: 2026-04-22 +**范围**: Ask / Acquire / 
Appraise / Apply 四个阶段的 Judge 评分架构重设计 +**不在本次范围内**: Assess 阶段 Judge;`_score_*` Python 侧的具体实现代码;prompt 文件的逐字改写 + +--- + +## 一、整体架构 + +### 设计动机 + +原架构中 LLM Judge 输出 YES/PARTIAL/NO 分类标签,Python 侧将其映射为连续分数(如 PARTIAL→0.4)。这层映射对 LLM 和人工标注者均不透明,导致: + +1. LLM 不知道自己的 PARTIAL 会被算成多少分,判断标准模糊 +2. 标注数据集验证时,人工标注者无法复现评分逻辑 + +新架构采用 **Gate + Weighted Rubrics**: + +- **Gate(一票否决)**:任一 gate 项 = NO → 整体 fail,跳过评分,直接触发对应决策动作 +- **Weighted Rubrics**:每条 rubric 有固定权重(Critical=3 / Major=2 / Minor=1),YES=满分,PARTIAL=满分×0.5,NO=0 +- **总分** = Σ(得分) / Σ(满分),≥ 0.7 → pass + +### 执行流程 + +``` +LLM Judge + ↓ 输出每条 rubric 的 YES / PARTIAL / NO +Python 侧 + ↓ Step 1: Gate 检查(任一 gate rubric = NO → 立即 fail,跳过评分) + ↓ Step 2: Weighted rubric 评分(YES=满分, PARTIAL=半分, NO=0) + ↓ Step 3: 总分 = Σ(得分) / Σ(满分),≥ 0.7 → pass +决策模型 + ↓ 读取 gate 失败项 / 低分 rubric 群 → 生成定向 retry 指令 +``` + +### 分值体系 + +| 类型 | 权重 | YES | PARTIAL | NO | +|---|---|---|---|---| +| Gate(一票否决) | 不参与评分 | 通过 | 不存在 | 整体 fail | +| Critical rubric | 3 | 3分 | 1.5分 | 0分 | +| Major rubric | 2 | 2分 | 1分 | 0分 | +| Minor rubric | 1 | 1分 | 0.5分 | 0分 | + +### 标注数据集友好性原则 + +每条 rubric 在 prompt 中必须包含三行明确标准: +- **YES 标准**:明确的通过条件 +- **PARTIAL 标准**:明确的部分通过条件(不是"大致符合",而是具体描述) +- **NO 标准**:明确的不通过条件 + +人工标注者和 LLM 面对同一套判断标准,可直接对比输出结果以验证 LLM Judge 的忠实度。 + +--- + +## 二、Ask 阶段 + +### Gate 项 + +| Gate | YES | NO | +|---|---|---| +| `intent_not_distorted` | 结构化结果忠实反映原问题意图(方向性正确:人群、问题类型) | 方向性错误(问儿童→写成人;问治疗→写诊断) | +| `route_correct` | route_type 与问题类型匹配 | 明显错误(诊断准确性问题路由为 ebm_pico) | + +`direct_answer` 路由额外 gate: + +| Gate | YES | NO | +|---|---|---| +| `nonresearch_classification_correct` | 三条触发条件全部满足(立即操作性指导 + 延迟危及生命 + 公认标准流程) | 任一条件不满足,应重路由到 EBM 流程 | + +### Rubric 评分项(仅适用于 EBM 路由,direct_answer 不评分) + +**Critical(满分3)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `core_dimensions_present` | P + 主焦点维度(I/IndexTest/Exposure/PF)+ O 均有实质内容 | 三者中有一个描述极度模糊但方向正确 | 任一核心维度完全缺失或填写错误 | + +**Major(满分2)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| 
`secondary_dimensions_present` | 次要维度(C/R/TH)按路由要求填写,原问题未涉及的填 NA | 次要维度有轻微偏差但不影响检索方向 | 次要维度明显错误(如 PIRD 的 R 字段填了干预措施) | + +**Minor(满分1)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `statement_unambiguous` | 表述无歧义,可直接用于检索 | 有轻微歧义但不影响检索方向 | 严重歧义,检索方向不确定 | + +### 满分计算 + +EBM 路由:Critical(3) + Major(2) + Minor(1) = **6分满分** + +### 决策模型 + +``` +route_type = direct_answer + nonresearch_classification_correct = YES → terminate(流程终止,直接回答) + nonresearch_classification_correct = NO → retry_route(重路由到 ebm_pico) + +route_type = ebm_* + Gate 失败 + intent_not_distorted = NO → retry,指令:重新理解原问题意图,不得改变人群/问题类型 + route_correct = NO → retry_route,指令:重新判断问题类型并选择正确路由框架 + + 评分 < 0.7(无 gate 失败) + core_dimensions_present 低 → retry_structure,指令:补全缺失的核心维度 + secondary_dimensions 低 → retry_structure,指令:修正次要维度 + 超过 max_retry → fallback:强制 ebm_pico,写入 route_confidence=low + + 评分 ≥ 0.7 且无 gate 失败 → proceed +``` + +--- + +## 三、Acquire 阶段 + +### Gate 项 + +| Gate | YES | NO | +|---|---|---| +| `search_terms_valid` | 检索词方向正确,能对应到 PICO/PIRD/PEO/Prognosis 的核心概念 | 检索词方向完全错误(如问心衰治疗却检索肾功能指标) | + +### 特殊路径:evidence_gap + +检索词有效但结果为零(`search_exhausted=true`)→ 不触发 gate,直接 proceed,写入 `evidence_gap_detected=true`,跳过评分。 + +### Rubric 评分项 + +**Critical(满分3)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `keywords_cover_pico_dimensions` | 关键词覆盖 P + 主焦点维度(I/IndexTest/Exposure/PF),且至少含一个可在 MeSH 验证的标准词 | 覆盖了 P 或主焦点之一,但另一维度无对应关键词;或有覆盖但无 MeSH 标准词 | 关键词全部指向同一概念,未覆盖多个维度 | +| `primary_focus_match` | 证据中的核心干预/暴露/测试与查询主焦点维度精准匹配 | 同类方法但有差异(不同剂量/版本),相关性高 | 完全不同的测试/干预/暴露 | +| `outcome_match` | 证据报告了临床关心的直接结局指标 | 报告了代理指标或部分相关结局 | 未报告任何相关结局 | + +**Major(满分2)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `keywords_have_synonyms` | 核心概念有同义词/变体(如 SGLT2i + empagliflozin + dapagliflozin) | 有部分同义词但不完整 | 无任何同义词扩展,仅有单一术语 | +| `keywords_count_sufficient` | 关键词数量 ≥ 5 个 | 3-4 个 | ≤ 2 个 | +| `study_design_matches_route` | 纳入文献的研究设计与 route_type 的优先级匹配(见下方匹配表) | 
有次优先级文献但无第一优先级,或混入少量不匹配设计 | 大量纳入与 route_type 不匹配的研究设计 | +| `population_match` | 证据中的研究人群与查询 Patient 匹配(年龄段、疾病状态) | 有轻微差异(年龄范围略不同),结论可审慎外推 | 严重不匹配(成人证据用于儿科;完全不同疾病) | + +**Minor(满分1)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `top_selection_appropriate` | 排名靠前的文献是相关性最高、研究设计级别最高的 | 排序有轻微偏差,个别文献位置不最优 | 排名靠前的文献明显不如排名靠后的文献 | +| `selection_count_appropriate` | 选取数量合理(有效候选多时选足,质量差时不强行凑数) | 数量略多或略少,但不影响后续评价 | 明显不合理(大量高质量候选却只选1-2篇,或质量极差仍凑满10篇) | +| `key_sentences_present` | Top 文章的 key_sentences 非空,RAG 流程正常执行 | 部分文章 key_sentences 为空(摘要极短导致 chunk 失败) | 所有文章 key_sentences 均为空,RAG 流程可能失败 | + +### 研究设计与 route_type 匹配表 + +| route_type | 第一优先级 | 第二优先级 | 第三优先级 | 通常排除 | +|---|---|---|---|---| +| ebm_pico(治疗) | SR/Meta分析(基于RCT) | RCT | 观察性研究 | 机制综述、专家意见、病例报告 | +| ebm_pird(诊断) | SR/Meta分析(基于诊断准确性研究) | 诊断准确性研究(横断面) | 回顾性诊断研究 | 机制综述、治疗类RCT | +| ebm_peo(病因/危害) | SR/Meta分析(基于观察性研究) | 前瞻性队列研究 | 病例对照研究 | RCT、机制综述 | +| ebm_prognosis(预后) | SR/Meta分析(基于队列研究) | 前瞻性队列研究 | 回顾性队列研究 | 机制综述、病例报告 | + +### 满分计算 + +Critical(3×3) + Major(2×4) + Minor(1×3) = **9 + 8 + 3 = 20分满分** + +### 决策模型 + +``` +evidence_gap_detected = true → proceed(标记 evidence_gap,Apply 阶段处理) + +Gate 失败 + search_terms_valid = NO → retry,指令:根据 PICO/PIRD/PEO/Prognosis 重新构建检索词 + +评分 < 0.7(无 gate 失败) + keywords_* 低 → retry,指令:补充同义词/MeSH词/覆盖缺失维度 + primary_focus_match / outcome_match 低 → retry,指令:调整检索词以匹配主焦点和结局 + study_design_matches_route 低 → retry,指令:调整研究设计过滤器 + population_match = NO → backtrack 到 Ask,指令:重新确认 Patient 维度定义 + 超过 max_retry → proceed(降级,写入 evidence_quality_warning) + +评分 ≥ 0.7 且无 gate 失败 → proceed +``` + +--- + +## 四、Appraise 阶段 + +### 两层架构 + +Appraise Judge 分两层执行,Layer 1 通过则不调用 LLM。 + +#### Layer 1:Python 硬编码校验(Gate 等价) + +| 检查项 | 通过条件 | 失败动作 | +|---|---|---| +| `all_studies_have_study_type` | 每篇文献都有 study_type 字段且值合法 | 触发 Layer 2 LLM Judge | +| `all_studies_have_rob_fields` | 每篇文献都有 risk_of_bias 字段 | 触发 Layer 2 LLM Judge | +| `grade_inputs_complete` | GRADE 计算所需字段无缺失 | 触发 Layer 2 LLM Judge | +| 
`grade_output_in_legal_range` | 最终等级在 {High/Moderate/Low/Very Low} 内 | 抛出系统异常,不重试 | + +全部通过 → 直接 proceed,不调 LLM Judge。 + +#### Layer 2:LLM Judge Gate 项 + +| Gate | YES | NO | +|---|---|---| +| `study_type_correct` | 所有研究的 study_type 识别正确 | 存在明显错误(观察性研究标记为 RCT) | +| `computed_grade_reasonable` | 计算出的 GRADE 等级与基于摘要的独立判断一致 | 明显不合理(通常是 study_type 或降级因素错误导致) | + +注意:以下情况属于**合理结果**,`computed_grade_reasonable` 不应判断为 NO: +- SR/MA 纳入观察性研究(included_study_type=OBSERVATIONAL)→ 初始分为 Low,即使无降级因素也可能输出 Low/Very Low +- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时,即使 large_effect=YES 也不升级 +- CROSS_SECTIONAL 无升级因素 → 最高只能到 Low + +### Rubric 评分项 + +**Critical(满分3)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | 适用条件 | +|---|---|---|---|---| +| `downgrade_factors_appropriate` | 四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注与摘要信息相符 | 整体合理,个别因素有轻微偏差(过宽或过严) | 存在明显错误(未盲法 RCT 标记为 NOT_SERIOUS 偏倚风险) | 始终 | +| `included_study_type_correct` | SR/MA/NMA 的 included_study_type 与摘要描述的纳入研究类型相符 | 摘要信息不足以确认(填 UNKNOWN 是合理选择) | 明显错误(摘要写"仅纳入RCT"但标注为 OBSERVATIONAL) | 仅当证据列表含 SR/MA/NMA | + +**Major(满分2)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | 适用条件 | +|---|---|---|---|---| +| `upgrade_factors_appropriate` | 升级因素(large_effect/dose_response/confounding_bias_mitigates)标注与摘要信息相符 | 整体合理,个别因素有轻微偏差 | 明显错误(无剂量效应数据但标注 dose_response=YES) | 仅当证据列表含 COHORT/CASE_CONTROL | +| `upgrade_blocked_appropriate` | 存在 SERIOUS/VERY_SERIOUS 偏倚时,upgrade_blocked_by_bias=True 且最终等级未因升级因素提升 | — | 存在严重偏倚但升级因素仍被计入 | 仅当含 COHORT/CASE_CONTROL 且 risk_of_bias=SERIOUS/VERY_SERIOUS | +| `conflicts_identified` | 证据间存在实质性冲突时,冲突被正确识别并描述 | 冲突识别不完整,遗漏了部分冲突说明 | 存在明显冲突但完全未识别 | 始终 | + +**Minor(满分1)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | 适用条件 | +|---|---|---|---|---| +| `numerical_data_extracted` | 摘要中存在效应量/CI/P值时均被提取 | 部分提取,有遗漏但不影响 GRADE 结论 | 存在数值数据但完全未提取 | 始终 | + +### 满分计算(最大情形) + +Critical(3×2) + Major(2×3) + Minor(1×1) = **6 + 6 + 1 = 13分满分**(NA 项不参与分母) + +### 决策模型 + +``` +Layer 1 全部通过 → proceed(不调 LLM Judge) + +Layer 1 失败 → 触发 
LLM Judge + grade_output_in_legal_range 失败 → 系统异常,终止 + + LLM Judge Gate 失败 + study_type_correct = NO → retry(重新执行整个 Appraise) + computed_grade_reasonable = NO → retry(重新执行整个 Appraise) + + LLM Judge 定位问题根因 + 某篇文献字段缺失 + 根因 = LLM漏读 → 重新提取该文献,回到 Appraise 重算 + 某篇文献字段缺失 + 根因 = 文献本身不足 → 标记该文献剔除,回到 Appraise 重算 + + 评分 < 0.7(无 gate 失败) + downgrade_factors 低 → retry,指令:重新评估指定降级因素 + conflicts_identified 低 → retry,指令:补充冲突识别 + + 所有文献标记"信息不足"后 GRADE = Very Low 且文献数量不足 + → backtrack 到 Acquire,指令:扩大检索范围 + + 评分 ≥ 0.7 且无 gate 失败 → proceed +``` + +--- + +## 五、Apply 阶段 + +### Gate 项 + +| Gate | YES | NO | +|---|---|---| +| `recommendation_grounded_in_evidence` | 推荐意见基于本次检索的证据,方向与证据一致 | 推荐与证据无关或方向相反 | +| `route_dimension_consistent` | Apply 的维度一致性检查使用了与 route_type 匹配的框架(PICO/PIRD/PEO/Prognosis) | 使用了错误框架(如 PIRD 问题用 PICO 框架,Index Test 被映射为 Intervention) | +| `strength_not_grossly_inflated` | 推荐强度未严重超出证据上限 | Very Low 或 Low 证据给出 Strong 推荐,或有充分高质量证据却输出 No Recommendation | + +### Rubric 评分项 + +**Critical(满分3)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `effect_size_correctly_reported` | 效应量、置信区间、GRADE 等级被正确转述,无数据失真 | 数值基本正确,有轻微表述偏差但不影响结论方向 | 效应量或 GRADE 等级被错误转述,导致结论方向改变 | +| `strength_matches_evidence` | 推荐强度与证据等级严格匹配(含 inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐属正确行为) | 有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 | 推荐强度与证据等级明显不符(不触发 gate 的中等程度不匹配) | + +**Major(满分2)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `population_applicability_addressed` | 明确说明证据人群与当前患者的匹配程度,包括可外推性或外推限制 | 有提及人群差异但说明不充分 | 完全未讨论人群适配性 | +| `uncertainty_source_explained` | 不确定性的来源被明确说明(如样本量不足、间接证据、研究设计局限) | 提及了不确定性但未说明来源 | 未提及不确定性,或仅说"证据有限"而无来源说明 | +| `citation_traceable` | 推荐依据有文献溯源(PMID 或标题可追溯) | 部分推荐有溯源,部分缺失 | 无任何文献溯源 | + +**Minor(满分1)** + +| Rubric | YES 标准 | PARTIAL 标准 | NO 标准 | +|---|---|---|---| +| `recommendation_specific` | 推荐内容具体,临床医生可据此执行(含适应症、关键参数等) | 推荐方向明确但缺少关键细节 | 推荐过于模糊,无法指导临床决策 | +| `patient_preference_considered` | 
患者偏好或价值观被纳入推荐表述(或明确说明不适用) | 有提及但表述笼统 | 完全未提及患者偏好 | + +### 满分计算 + +Critical(3×2) + Major(2×3) + Minor(1×2) = **6 + 6 + 2 = 14分满分** + +### 决策模型 + +``` +Gate 失败 + recommendation_grounded_in_evidence = NO + → retry,指令:严格基于本次检索证据重新生成推荐,不得引入外部知识 + + route_dimension_consistent = NO + → retry,指令:按 {route_type} 对应框架重新执行维度一致性检查 + + strength_not_grossly_inflated = NO + → retry,指令:依据 GRADE 原则重新确定推荐强度 + +评分 < 0.7(无 gate 失败) + effect_size_correctly_reported 低 + → retry,指令:修正数据转述,对照 Appraise 输出逐项核查效应量和 GRADE 等级 + + strength_matches_evidence 低 + → retry,指令:加强推荐强度约束 + + strength_matches_evidence = PARTIAL 且推荐强度 < 证据下限(过度保守) + → backtrack 到 Appraise,指令:重新检查 GRADE 评估是否存在隐含降级 + + population_applicability_addressed / uncertainty_source_explained 低 + → retry,指令:补充外推性分析和不确定性来源说明 + + citation_traceable 低 + → retry,指令:补充文献溯源 + + population_applicability_addressed 低且根因 = 证据根本不适用当前患者(原稿写作 clinical_fit,该名称未在本规范 Apply rubric 表中定义,待确认) + → backtrack 到 Acquire,指令:检索更匹配的文献(Judge 需说明不适用的具体原因) + +超过 max_retry + → 输出"当前证据不足以形成推荐意见"+ 证据摘要(合法终止路径) + +评分 ≥ 0.7 且无 gate 失败 → proceed(输出最终推荐) +``` + +--- + +## 六、标注数据集设计说明 + +### 验证目标 + +通过人工标注 vs LLM Judge 输出的对比,验证 LLM Judge 是否能忠实执行上述 rubric 规则。 + +### 标注样本结构 + +每个标注样本包含: + +```json +{ + "stage": "Ask | Acquire | Appraise | Apply", + "input": { + "original_question": "...", + "stage_output": { ... }, + "context": { ... } + }, + "rubric_labels": { + "gate_items": { + "intent_not_distorted": "YES | NO", + ... + }, + "scored_rubrics": { + "core_dimensions_present": "YES | PARTIAL | NO", + ... + } + }, + "overall_verdict": "pass | fail | gate_fail", + "annotator_notes": "..." +} +``` + +### 标注质量保障 + +1. **Gate 项只有 YES/NO**:标注者无需判断程度,降低歧义 +2. **每条 rubric 有三行明确标准**:YES/PARTIAL/NO 标准均有具体描述,不依赖标注者主观判断 +3. **NA 项明确标注**:适用条件不满足时标注 NA,不参与一致性计算 +4. 
**分歧处理**:Gate 项分歧 → 讨论解决;Scored rubric 分歧 → 允许 ±1 级(如一人 YES 一人 PARTIAL)视为一致 + +### 一致性指标 + +- Gate 项:Cohen's κ(二分类) +- Scored rubrics:Weighted κ(三分类 YES/PARTIAL/NO) +- 目标:κ ≥ 0.7(substantial agreement) + +--- + +## 七、与现有代码的对接说明 + +### `judge_llm.py` 改动方向 + +| 函数 | 改动 | +|---|---| +| `_score_ask` | 按新 rubric 体系重写;增加 gate 检查;移除 keywords 相关评分 | +| `_score_acquire` | 按新 rubric 体系重写;增加 keywords 评分(从 Ask 迁移);字段名更新(`primary_focus_match` 替代 `pico_i_match`) | +| `_score_appraise` | 增加 Layer 1 Python 校验前置;Layer 2 LLM Judge 按新 rubric 重写 | +| `_score_apply` | 按新 rubric 体系重写;增加 gate 检查 | +| `STAGE_WEIGHTS` | 替换为 rubric 权重表(Critical=3/Major=2/Minor=1) | + +### prompt 文件改动方向 + +每个阶段的 `*_judge.txt` 需要: +1. 将每条 rubric 以独立段落呈现,包含 YES/PARTIAL/NO 三行标准 +2. Gate 项单独列出,明确标注"一票否决" +3. 输出格式统一为 `gate_results` + `rubric_results` + `failures` + `overall_quality` + +具体 prompt 改写不在本规范范围内,由实现阶段处理。 diff --git a/src/agents/acquire_agent.py b/src/agents/acquire_agent.py index 09b7933..d33a65b 100644 --- a/src/agents/acquire_agent.py +++ b/src/agents/acquire_agent.py @@ -1,9 +1,14 @@ -from typing import List, Dict, Any +from __future__ import annotations + +import threading import time +from concurrent.futures import ThreadPoolExecutor from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + from src.agents.base import BaseAgent, robust_parse_json -from src.state.schema import WorkflowState, Evidence -from src.tools.pubmed_api import search_pubmed +from src.state.schema import EBMQuery, WorkflowState, Evidence +from src.tools.pubmed_api import fetch_pmc_full_text, search_pubmed from src.tools.local_evidence_db import search_local # Cochrane Handbook Highly Sensitive Search Strategy (HSSS) — @@ -43,9 +48,38 @@ "Harm": _OBSERVATIONAL_FILTER, } +# Map EBMQuery route_type to the appropriate PubMed filter +_FILTER_BY_ROUTE_TYPE = { + "ebm_pico": _HSSS_FILTER, + "ebm_peo": _OBSERVATIONAL_FILTER, + "ebm_pird": _DTA_FILTER, + "ebm_prognosis": _OBSERVATIONAL_FILTER, + "full_pipeline": 
_HSSS_FILTER, # default for generic full_pipeline +} + # Number of top-K articles to select via listwise ranking. _TOP_K = 10 +# --------------------------------------------------------------------------- +# Lazy-loaded sentence-transformer for RAG reranking +# --------------------------------------------------------------------------- +_embedding_model = None +_embedding_lock = threading.Lock() + + +def _get_embedding_model(): + """Return a shared SentenceTransformer instance (thread-safe lazy init).""" + global _embedding_model # noqa: PLW0603 + if _embedding_model is None: + with _embedding_lock: + if _embedding_model is None: + try: + from sentence_transformers import SentenceTransformer # noqa: PLC0415 + _embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + except Exception: + _embedding_model = None # graceful degradation + return _embedding_model + class AcquireAgent(BaseAgent): """ @@ -53,11 +87,12 @@ class AcquireAgent(BaseAgent): Evidence selection pipeline: 1. LLM builds a PubMed Boolean search query (acquire_agent.txt). - 2. PubMed API returns up to 20 candidates (HSSS filter applied first). - 3. Keyword-based study type inference runs on all candidates. - 4. LLM performs Listwise ranking: given the full candidate list, it - selects and ranks the Top-K most relevant articles in one pass. - 5. Rank-normalised relevance scores are assigned (rank 1 → 1.0). + 2. PubMed API returns up to 20 candidates (filter chosen by route_type). + 3. PMC full-text is fetched in parallel for open-access articles. + 4. BM25 + Embedding RAG extracts key sentences from full-text articles. + 5. Keyword-based study type inference runs on all candidates. + 6. LLM performs Listwise ranking → Top-K selection. + 7. Full-text articles are promoted to the front of the ranked list. 
""" def __init__(self, llm, tools: List[Any] = None): @@ -104,11 +139,96 @@ def _use_local_db(self, question_type: str = "Therapy") -> bool: """ return True - def _apply_search_filter(self, query: str, question_type: str = "Therapy") -> str: - """Wrap query with an appropriate filter based on question type.""" - search_filter = _FILTER_BY_QUESTION_TYPE.get(question_type, _HSSS_FILTER) + def _apply_search_filter(self, query: str, question_type: str = "Therapy", route_type: str = "") -> str: + """Wrap query with an appropriate filter based on route_type (preferred) or question_type.""" + if route_type and route_type in _FILTER_BY_ROUTE_TYPE: + search_filter = _FILTER_BY_ROUTE_TYPE[route_type] + else: + search_filter = _FILTER_BY_QUESTION_TYPE.get(question_type, _HSSS_FILTER) return f"({query}) AND {search_filter}" + def _fetch_full_texts(self, candidates: List[Evidence]) -> None: + """Fetch PMC full text for open-access articles in parallel (in-place). + + Only articles with a pmcid are attempted. Results are written directly + to evidence.full_text and evidence.has_full_text. + """ + pmc_candidates = [e for e in candidates if e.pmcid] + if not pmc_candidates: + return + + def _fetch_one(ev: Evidence) -> None: + try: + text = fetch_pmc_full_text(ev.pmid) + if text: + ev.full_text = text + ev.has_full_text = True + except Exception: + pass # non-fatal — abstract-only fallback is fine + + with ThreadPoolExecutor(max_workers=8) as pool: + list(pool.map(_fetch_one, pmc_candidates)) + + n_fetched = sum(1 for e in pmc_candidates if e.has_full_text) + print(f"[DEBUG] PMC full-text fetched: {n_fetched}/{len(pmc_candidates)}") + + def _rag_extract( + self, evidence: Evidence, query_terms: List[str] + ) -> Tuple[str, float]: + """Extract key sentences from full text using BM25 → Embedding rerank. + + Pipeline: + 1. Split full_text into sentences. + 2. BM25 retrieves top-8 candidate sentences. + 3. Embedding model reranks to top-3 by cosine similarity to query. 
+ + Returns (key_sentences_str, relevance_boost) where relevance_boost is + the mean cosine similarity of the top-3 sentences (0.0 if unavailable). + Falls back to abstract if full_text is absent. + """ + text = evidence.full_text or evidence.abstract or "" + if not text: + return "", 0.0 + + # Split into sentences (simple heuristic — good enough for abstracts/paragraphs) + import re + sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if len(s.strip()) > 20] + if not sentences: + return text[:500], 0.0 + + query_str = " ".join(query_terms) + + # BM25 retrieval + try: + from rank_bm25 import BM25Okapi + tokenised = [s.lower().split() for s in sentences] + bm25 = BM25Okapi(tokenised) + scores = bm25.get_scores(query_str.lower().split()) + top8_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:8] + top8 = [sentences[i] for i in top8_idx] + except Exception: + top8 = sentences[:8] + + # Embedding rerank to top-3 + model = _get_embedding_model() + if model is not None and len(top8) > 1: + try: + import numpy as np + query_emb = model.encode([query_str], normalize_embeddings=True)[0] + sent_embs = model.encode(top8, normalize_embeddings=True) + sims = sent_embs @ query_emb + top3_idx = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:3] + top3 = [top8[i] for i in top3_idx] + boost = float(np.mean([sims[i] for i in top3_idx])) + except Exception: + top3 = top8[:3] + boost = 0.0 + else: + top3 = top8[:3] + boost = 0.0 + + return " … ".join(top3), boost + def _infer_study_type(self, evidence: Evidence) -> str: """Infer study type from title and abstract using keyword rules.""" text = f"{evidence.title} {evidence.abstract or ''}".lower() @@ -215,10 +335,33 @@ def _listwise_rank( return result def execute(self, state: WorkflowState) -> Dict[str, Any]: - """Execute Acquire agent: build query → search PubMed → listwise rank.""" + """Execute Acquire agent: build query → search → full-text → RAG → listwise rank.""" + # Prefer 
EBMQuery (new routing); fall back to legacy PICOQuery + ebm_query: Optional[EBMQuery] = state.get("ebm_query") pico = state.get("pico_query") - if not pico: - raise ValueError("No PICO query found in state") + + if ebm_query is None and pico is None: + raise ValueError("No EBMQuery or PICOQuery found in state") + + # Derive a unified pico_dict for the ranking prompt (always needed) + if ebm_query is not None: + pico_dict = { + "patient": ebm_query.patient, + "intervention": ebm_query.primary_focus, + "comparison": ebm_query.comparator or "", + "outcome": ebm_query.outcome, + } + query_keywords = ebm_query.keywords + route_type = ebm_query.query_type # e.g. "pico", "pird", "peo", "prognosis" + else: + pico_dict = { + "patient": pico.patient, + "intervention": pico.intervention, + "comparison": pico.comparison, + "outcome": pico.outcome, + } + query_keywords = pico.keywords + route_type = state.get("route_type") or "" backtrack_context = "" if state.get("backtrack_reason"): @@ -229,11 +372,11 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: # Step 1: LLM builds Boolean search query prompt = self.prompt_template.format( - patient=pico.patient, - intervention=pico.intervention, - comparison=pico.comparison, - outcome=pico.outcome, - keywords=", ".join(pico.keywords), + patient=pico_dict["patient"], + intervention=pico_dict["intervention"], + comparison=pico_dict["comparison"], + outcome=pico_dict["outcome"], + keywords=", ".join(query_keywords), backtrack_context=backtrack_context, ) t0 = time.time() @@ -257,9 +400,12 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: print(f"[DEBUG] Local DB returned {len(raw_results)} articles") print(f"[TIMING] Local DB search: {time.time()-t0:.1f}s") else: - filtered_query = self._apply_search_filter(base_query, question_type) + filtered_query = self._apply_search_filter( + base_query, question_type=question_type, route_type=route_type + ) print( - f"[DEBUG] question_type={question_type}, filtered query: 
{filtered_query}" + f"[DEBUG] route_type={route_type}, question_type={question_type}, " + f"filtered query: {filtered_query}" ) raw_results = search_pubmed(query=filtered_query, max_results=20) print(f"[DEBUG] PubMed (filtered) returned {len(raw_results)} articles") @@ -282,40 +428,54 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: "error": str(e), } - # Step 3: Infer study type for all candidates (used as hint in ranking prompt) + # Step 3: Fetch PMC full texts in parallel for open-access articles + t0 = time.time() + self._fetch_full_texts(raw_results) + print(f"[TIMING] PMC full-text fetch: {time.time()-t0:.1f}s") + + # Step 4: RAG extract key sentences for full-text articles + rag_query_terms = query_keywords or base_query.split()[:10] + for ev in raw_results: + if ev.has_full_text and ev.full_text: + key_sents, boost = self._rag_extract(ev, rag_query_terms) + ev.key_sentences = key_sents + # Slightly boost relevance score for full-text articles (applied after ranking) + ev._rag_boost = boost # type: ignore[attr-defined] + + # Step 5: Infer study type for all candidates (used as hint in ranking prompt) for evidence in raw_results: evidence.study_type = self._infer_study_type(evidence) print(f"[DEBUG] Study types inferred for {len(raw_results)} candidates") - # Step 4: LLM Listwise ranking → Top-K selection - pico_dict = { - "patient": pico.patient, - "intervention": pico.intervention, - "comparison": pico.comparison, - "outcome": pico.outcome, - } - + # Step 6: LLM Listwise ranking → Top-K selection t0 = time.time() selected = self._listwise_rank(raw_results, pico_dict, top_k=_TOP_K) print(f"[TIMING] Listwise ranking LLM: {time.time()-t0:.1f}s") - for i, e in enumerate(selected): + # Step 7: Promote full-text articles to the front (stable sort) + full_text_first = sorted(selected, key=lambda e: 0 if e.has_full_text else 1) + + for i, e in enumerate(full_text_first): + ft_flag = "[FT]" if e.has_full_text else "" print( - f"[DEBUG] Rank {i + 1}: 
score={e.relevance_score:.3f}, " + f"[DEBUG] Rank {i + 1}{ft_flag}: score={e.relevance_score:.3f}, " f"type={e.study_type}, title={e.title[:80]}..." ) - print(f"[DEBUG] Listwise selected {len(selected)}/{len(raw_results)} articles") + print( + f"[DEBUG] Listwise selected {len(full_text_first)}/{len(raw_results)} articles " + f"({sum(1 for e in full_text_first if e.has_full_text)} with full text)" + ) study_type_distribution: Dict[str, int] = {} - for e in selected: + for e in full_text_first: t = e.study_type or "Unknown" study_type_distribution[t] = study_type_distribution.get(t, 0) + 1 return { - "evidence_list": selected, + "evidence_list": full_text_first, "search_query": search_query_used, "total_results": len(raw_results), - "selected_count": len(selected), + "selected_count": len(full_text_first), "study_type_distribution": study_type_distribution, } diff --git a/src/agents/apply_agent.py b/src/agents/apply_agent.py index 214a841..17892e4 100644 --- a/src/agents/apply_agent.py +++ b/src/agents/apply_agent.py @@ -1,7 +1,75 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional from pathlib import Path from src.agents.base import BaseAgent, robust_parse_json -from src.state.schema import WorkflowState, Recommendation +from src.state.schema import WorkflowState, Recommendation, EBMQuery, PICOQuery + + +def _format_ebm_query(ebm_query: EBMQuery) -> str: + """Format an EBMQuery into a concise human-readable description.""" + parts = [f"类型: {ebm_query.query_type}"] + parts.append(f"患者/人群: {ebm_query.patient}") + parts.append(f"主要关注点: {ebm_query.primary_focus}") + if ebm_query.comparator: + parts.append(f"对照: {ebm_query.comparator}") + if ebm_query.reference_standard: + parts.append(f"参考标准: {ebm_query.reference_standard}") + parts.append(f"结局: {ebm_query.outcome}") + if ebm_query.time_horizon: + parts.append(f"时间范围: {ebm_query.time_horizon}") + return "; ".join(parts) + + +def _format_pico_query(pico_query: PICOQuery) -> str: + 
"""Format a PICOQuery into a concise human-readable description.""" + return ( + f"P: {pico_query.patient}; " + f"I: {pico_query.intervention}; " + f"C: {pico_query.comparison}; " + f"O: {pico_query.outcome}" + ) + + +def _summarize_downgrade_factors(grade_rationales: List[Dict]) -> Dict[str, Any]: + """ + Summarise key downgrade factors across all appraised studies. + + Returns a dict with: + - key_downgrade_factors: human-readable string listing the most common issues + - has_serious_inconsistency: bool — True when any study has inconsistency + rated SERIOUS or VERY_SERIOUS + """ + factor_counts: Dict[str, int] = {} + has_serious_inconsistency = False + + for r in grade_rationales: + for factor in ("risk_of_bias", "inconsistency", "indirectness", "imprecision"): + val = r.get(factor, "NOT_SERIOUS") + if val in ("SERIOUS", "VERY_SERIOUS"): + factor_counts[factor] = factor_counts.get(factor, 0) + 1 + if r.get("inconsistency") in ("SERIOUS", "VERY_SERIOUS"): + has_serious_inconsistency = True + if r.get("publication_bias") == "SUSPECTED": + factor_counts["publication_bias"] = factor_counts.get("publication_bias", 0) + 1 + + if not factor_counts: + key_downgrade_factors = "无主要降级因素" + else: + _label_map = { + "risk_of_bias": "偏倚风险", + "inconsistency": "不一致性", + "indirectness": "间接性", + "imprecision": "不精确性", + "publication_bias": "发表偏倚", + } + parts = [ + f"{_label_map.get(k, k)}({v}篇)" for k, v in sorted(factor_counts.items()) + ] + key_downgrade_factors = "、".join(parts) + + return { + "key_downgrade_factors": key_downgrade_factors, + "has_serious_inconsistency": has_serious_inconsistency, + } class ApplyAgent(BaseAgent): @@ -42,10 +110,34 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: ] ) + # --- Build structured query description --- + ebm_query = state.get("ebm_query") + pico_query = state.get("pico_query") + if ebm_query: + query_description = _format_ebm_query(ebm_query) + elif pico_query: + query_description = _format_pico_query(pico_query) + 
else: + query_description = question + + # --- Summarise downgrade factors from grade_rationales --- + grade_rationales: List[Dict] = state.get("grade_rationales") or [] + downgrade_summary = _summarize_downgrade_factors(grade_rationales) + key_downgrade_factors = downgrade_summary["key_downgrade_factors"] + has_serious_inconsistency = downgrade_summary["has_serious_inconsistency"] + + # --- Route type context --- + route_type = state.get("route_type") or "full_pipeline" + route_confidence: Optional[float] = state.get("route_confidence") + prompt = self.prompt_template.format( question=question, + query_description=query_description, + route_type=route_type, evidence_summary=evidence_summary, appraisal_summary=appraisal.summary, + key_downgrade_factors=key_downgrade_factors, + has_serious_inconsistency="YES" if has_serious_inconsistency else "NO", backtrack_context=backtrack_context, ) @@ -90,14 +182,24 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: llm_strength = rec_dict.get("strength", "Weak") if evidence_quality in ("Very Low", "Low") and llm_strength == "Strong": strength = "Weak" + elif has_serious_inconsistency and llm_strength == "Strong": + # Serious inconsistency across studies also blocks Strong recommendation + strength = "Weak" else: strength = llm_strength + # Build caveats list, appending route_confidence warning when confidence is low + caveats: List[str] = list(rec_dict.get("caveats", [])) + if route_confidence is not None and route_confidence < 0.7: + caveats.append( + f"路由置信度较低({route_confidence:.0%}),问题分类可能不准确,建议人工核实检索策略是否匹配临床问题类型。" + ) + recommendation = Recommendation( text=rec_dict["recommendation"], strength=strength, rationale=rec_dict["rationale"], - caveats=rec_dict.get("caveats", []), + caveats=caveats, evidence_quality=evidence_quality, ) diff --git a/src/agents/appraise_agent.py b/src/agents/appraise_agent.py index adf389a..21488ad 100644 --- a/src/agents/appraise_agent.py +++ b/src/agents/appraise_agent.py @@ -12,18 
+12,32 @@ # Initial GRADE points by study type (4=High, 3=Moderate, 2=Low, 1=Very Low) _INITIAL_POINTS: Dict[str, int] = { "RCT": 4, - "SYSTEMATIC_REVIEW": 4, # Starts High (synthesizes RCTs or best available evidence) - "META_ANALYSIS": 4, # Starts High - "NMA": 4, # Network meta-analysis: starts High + "SYSTEMATIC_REVIEW": 4, # Dynamic: overridden by _SR_INITIAL_POINTS when included_study_type is known + "META_ANALYSIS": 4, + "NMA": 4, "COHORT": 2, "CASE_CONTROL": 2, "CROSS_SECTIONAL": 2, # Observational: starts Low - "NARRATIVE_REVIEW": 1, # Expert synthesis without systematic search: Very Low + "NARRATIVE_REVIEW": 1, "CASE_REPORT": 1, - "GUIDELINE": 3, # Typically based on SR: starts Moderate - "EXPERT_OPINION": 1, # No systematic search: Very Low + "GUIDELINE": 3, + "EXPERT_OPINION": 1, } +# For SR/MA/NMA: initial points depend on the type of included studies +_SR_INITIAL_POINTS: Dict[str, int] = { + "RCT": 4, + "OBSERVATIONAL": 2, + "MIXED": 3, + "UNKNOWN": 3, +} + +# Study types that are SRs/MAs (use _SR_INITIAL_POINTS when included_study_type is set) +_SR_TYPES = {"SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA"} + +# Only these observational study types are eligible for upgrade factors +_UPGRADE_STUDY_TYPES = {"COHORT", "CASE_CONTROL"} + # Mapping from GRADE codes to human-readable study type labels # (used to sync evidence.study_type with Appraise classification) _GRADE_CODE_TO_LABEL: Dict[str, str] = { @@ -70,29 +84,46 @@ def _compute_grade(appraisal: Dict) -> str: Deterministically compute GRADE level from LLM classification labels. Rules: - - Start from initial points based on study_type - - Deduct for each downgrade factor (risk_of_bias, inconsistency, - indirectness, imprecision, publication_bias) - - Add for upgrade factors (large_effect, dose_response) - only when study_type is observational (COHORT / CASE_CONTROL) - - Clamp result to [1, 4] and map to label + 1. 
Initial points: + - SR/MA/NMA: use _SR_INITIAL_POINTS keyed by included_study_type + (RCT→4, OBSERVATIONAL→2, MIXED→3, UNKNOWN→3); fall back to 4. + - All other types: use _INITIAL_POINTS. + 2. Downgrade for each factor (risk_of_bias, inconsistency, indirectness, + imprecision) and for suspected publication_bias. + 3. Upgrade (large_effect, dose_response) only when: + - study_type is in _UPGRADE_STUDY_TYPES (COHORT or CASE_CONTROL), AND + - risk_of_bias is NOT_SERIOUS (serious bias blocks upgrades). + Upgraded points are capped at min(points, 3) — observational evidence + cannot reach High (4) through upgrades alone. + 4. Clamp to [1, 4] and map to label. """ study_type = appraisal.get("study_type", "CASE_REPORT") - points = _INITIAL_POINTS.get(study_type, 1) - # Downgrade factors + # Step 1: initial points + if study_type in _SR_TYPES: + included = appraisal.get("included_study_type", "UNKNOWN") + points = _SR_INITIAL_POINTS.get(included, _SR_INITIAL_POINTS["UNKNOWN"]) + else: + points = _INITIAL_POINTS.get(study_type, 1) + + # Step 2: downgrade factors for factor in ("risk_of_bias", "inconsistency", "indirectness", "imprecision"): points -= _DOWNGRADE_PENALTY.get(appraisal.get(factor, "NOT_SERIOUS"), 0) if appraisal.get("publication_bias") == "SUSPECTED": points -= 1 - # Upgrade factors (observational studies only) - if study_type in ("COHORT", "CASE_CONTROL", "CROSS_SECTIONAL"): + # Step 3: upgrade factors — only for COHORT/CASE_CONTROL with no serious bias + if ( + study_type in _UPGRADE_STUDY_TYPES + and appraisal.get("risk_of_bias", "NOT_SERIOUS") == "NOT_SERIOUS" + ): if appraisal.get("large_effect") == "YES": points += 1 if appraisal.get("dose_response") == "YES": points += 1 + # Observational evidence cannot reach High (4) through upgrades alone + points = min(points, 3) points = max(1, min(4, points)) return _POINTS_TO_GRADE[points] diff --git a/src/agents/ask_agent.py b/src/agents/ask_agent.py index b0747ba..d9ccf6b 100644 --- a/src/agents/ask_agent.py 
+++ b/src/agents/ask_agent.py @@ -1,54 +1,404 @@ -from typing import List, Dict, Any +""" +AskAgent — clinical question triage and EBM query structuring. + +Routing flow: + 1. Router prompt → route_type: "direct_answer" | "full_pipeline" | "sub_questions" + 2a. direct_answer → DirectAnswer prompt → populate direct_answer_output, set should_terminate + 2b. sub_questions → decompose into sub-question list, recurse on first sub-question + 2c. full_pipeline → framework-specific prompt (PICO / PIRD / PEO / Prognosis) + Diagnosis questions run diag_step1 → diag_step2 before PIRD +""" + +from __future__ import annotations + +import logging from pathlib import Path +from typing import Any, Dict, List, Optional + from src.agents.base import BaseAgent, robust_parse_json -from src.state.schema import WorkflowState, PICOQuery +from src.state.schema import EBMQuery, PICOQuery, WorkflowState + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Prompt directory +# --------------------------------------------------------------------------- +_PROMPT_DIR = Path(__file__).parent.parent / "config" / "prompts" / "ask" + +# Map framework name → prompt file stem +_FRAMEWORK_PROMPT: Dict[str, str] = { + "pico": "ebm_pico", + "pird": "ebm_pird", + "peo": "ebm_peo", + "prognosis": "ebm_prognosis", + "diagnostic_reasoning": "ebm_pird", # fallback to PIRD for diagnostic_reasoning +} + +# Valid question types +_VALID_QUESTION_TYPES = {"Therapy", "Diagnosis", "Prognosis", "Harm", "Prevention", "Background", "Mixed"} + +# Valid route types +_VALID_ROUTE_TYPES = {"direct_answer", "full_pipeline", "sub_questions"} + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + +def _load_prompt(stem: str) -> str: + """Load a prompt template from the ask/ directory.""" + path = _PROMPT_DIR / f"{stem}.txt" + with 
open(path, "r", encoding="utf-8") as fh: + return fh.read() + + +def _safe_str(value: Any, default: str = "") -> str: + """Return str(value) or default if value is None/falsy.""" + if value is None: + return default + return str(value) + +def _ebm_query_from_dict(d: dict) -> EBMQuery: + """Build an EBMQuery dataclass from a parsed LLM JSON dict.""" + return EBMQuery( + query_type=_safe_str(d.get("query_type"), "pico"), + patient=_safe_str(d.get("patient")), + primary_focus=_safe_str(d.get("primary_focus")), + outcome=_safe_str(d.get("outcome")), + keywords=d.get("keywords") or [], + comparator=d.get("comparator"), + reference_standard=d.get("reference_standard"), + time_horizon=d.get("time_horizon"), + ) + + +def _pico_from_ebm(ebm: EBMQuery) -> PICOQuery: + """Derive a legacy PICOQuery from an EBMQuery for backward compatibility.""" + return PICOQuery( + patient=ebm.patient, + intervention=ebm.primary_focus, + comparison=_safe_str(ebm.comparator), + outcome=ebm.outcome, + keywords=ebm.keywords, + ) + + +# --------------------------------------------------------------------------- +# AskAgent +# --------------------------------------------------------------------------- class AskAgent(BaseAgent): - """Agent for refining clinical questions into PICO format""" + """ + Agent for triaging clinical questions and structuring them into EBM queries. 
- def __init__(self, llm, tools: List[Any] = None): - super().__init__(llm=llm, tools=tools or [], agent_type="Ask") - self.prompt_template = self._load_prompt() + Routing logic: + - direct_answer → answer immediately, set should_terminate = True + - sub_questions → decompose, store list, process first sub-question + - full_pipeline → select framework prompt, build EBMQuery + - Diagnosis questions run a two-step diagnostic analysis first + """ - def _load_prompt(self) -> str: - """Load prompt template from file""" - prompt_path = ( - Path(__file__).parent.parent / "config" / "prompts" / "ask_agent.txt" - ) - with open(prompt_path, "r", encoding="utf-8") as f: - return f.read() + def __init__(self, llm, tools: Optional[List[Any]] = None): + super().__init__(llm=llm, tools=tools or [], agent_type="Ask") + # Pre-load all prompt templates at init time to catch missing files early + self._prompts: Dict[str, str] = { + stem: _load_prompt(stem) + for stem in [ + "router", + "direct_answer", + "diag_step1", + "diag_step2", + "ebm_pico", + "ebm_pird", + "ebm_peo", + "ebm_prognosis", + ] + } - def _parse_json(self, content: str) -> dict: - """Parse JSON from LLM response with heuristic error recovery.""" - return robust_parse_json(content) + # ------------------------------------------------------------------ + # Public entry point + # ------------------------------------------------------------------ def execute(self, state: WorkflowState) -> Dict[str, Any]: - """Execute Ask agent to extract PICO from question""" + """ + Execute the Ask agent. + + Returns a dict of state updates. The coordinator merges this into + WorkflowState. 
+ """ question = state["original_question"] + backtrack_context = self._build_backtrack_context(state) + + # ── Step 1: Route ────────────────────────────────────────────── + route_result = self._run_router(question, backtrack_context) + route_type = route_result.get("route_type", "full_pipeline") + if route_type not in _VALID_ROUTE_TYPES: + logger.warning("Unknown route_type '%s', defaulting to full_pipeline", route_type) + route_type = "full_pipeline" + + route_confidence = float(route_result.get("route_confidence", 0.0)) + question_type = route_result.get("question_type") or "Therapy" + if question_type not in _VALID_QUESTION_TYPES: + question_type = "Therapy" + ebm_framework = route_result.get("ebm_framework") or "pico" + + logger.info( + "AskAgent routing: route=%s (conf=%.2f) type=%s framework=%s", + route_type, route_confidence, question_type, ebm_framework, + ) + + # ── Step 2a: Direct answer ───────────────────────────────────── + if route_type == "direct_answer": + return self._handle_direct_answer( + question=question, + question_type=question_type, + routing_rationale=route_result.get("routing_rationale", ""), + route_confidence=route_confidence, + ) - backtrack_context = "" - if state.get("backtrack_reason"): - backtrack_context = f"Previous attempt failed: {state['backtrack_reason']}\nPlease refine the question." 
+ # ── Step 2b: Sub-questions ───────────────────────────────────── + if route_type == "sub_questions": + sub_question_texts: List[str] = route_result.get("sub_question_texts") or [] + if not sub_question_texts: + # Router said sub_questions but gave no list — fall through to full_pipeline + logger.warning("sub_questions route but no sub_question_texts; falling back to full_pipeline") + route_type = "full_pipeline" + else: + return self._handle_sub_questions( + sub_question_texts=sub_question_texts, + question_type=question_type, + ebm_framework=ebm_framework, + route_confidence=route_confidence, + backtrack_context=backtrack_context, + ) - prompt = self.prompt_template.format( - question=question, backtrack_context=backtrack_context + # ── Step 2c: Full pipeline ───────────────────────────────────── + return self._handle_full_pipeline( + question=question, + question_type=question_type, + ebm_framework=ebm_framework, + route_confidence=route_confidence, + backtrack_context=backtrack_context, ) + # ------------------------------------------------------------------ + # Router + # ------------------------------------------------------------------ + + def _run_router(self, question: str, backtrack_context: str) -> dict: + """Call the router prompt and return parsed JSON.""" + prompt = self._prompts["router"].format( + question=question, + backtrack_context=backtrack_context, + ) response = self.llm.invoke(prompt) - pico_dict = self._parse_json(response.content) - - pico_query = PICOQuery( - patient=pico_dict["patient"], - intervention=pico_dict["intervention"], - comparison=pico_dict["comparison"], - outcome=pico_dict["outcome"], - keywords=pico_dict["keywords"], + try: + return robust_parse_json(response.content) + except ValueError as exc: + logger.error("Router JSON parse failed: %s", exc) + return {"route_type": "full_pipeline", "question_type": "Therapy", "ebm_framework": "pico"} + + # ------------------------------------------------------------------ + # Route 
handlers + # ------------------------------------------------------------------ + + def _handle_direct_answer( + self, + question: str, + question_type: str, + routing_rationale: str, + route_confidence: float, + ) -> Dict[str, Any]: + """Run the direct_answer prompt and return state updates.""" + prompt = self._prompts["direct_answer"].format( + question=question, + question_type=question_type, + routing_rationale=routing_rationale, ) + response = self.llm.invoke(prompt) + try: + answer_dict = robust_parse_json(response.content) + except ValueError as exc: + logger.error("DirectAnswer JSON parse failed: %s", exc) + answer_dict = {"answer": response.content, "requires_pipeline": False} - question_type = pico_dict.get("question_type", "Therapy") - valid_types = {"Therapy", "Diagnosis", "Prognosis", "Harm", "Prevention"} - if question_type not in valid_types: - question_type = "Therapy" + # If the LLM decided mid-answer that a pipeline is needed, honour it + if answer_dict.get("requires_pipeline"): + logger.info("DirectAnswer prompt escalated to full_pipeline") + return self._handle_full_pipeline( + question=question, + question_type=question_type, + ebm_framework="pico", + route_confidence=route_confidence, + backtrack_context="", + ) + + return { + "route_type": "direct_answer", + "route_confidence": route_confidence, + "question_type": question_type, + "direct_answer_output": answer_dict, + "should_terminate": True, + # Keep pico_query / ebm_query as None — not needed for direct answers + } + + def _handle_sub_questions( + self, + sub_question_texts: List[str], + question_type: str, + ebm_framework: str, + route_confidence: float, + backtrack_context: str, + ) -> Dict[str, Any]: + """ + Decompose into sub-questions. + + Each sub-question is structured independently using the appropriate + framework prompt. The results are stored in sub_pico_queries. 
+ The first sub-question is also promoted to ebm_query / pico_query + so the rest of the pipeline can proceed immediately. + """ + sub_queries: List[EBMQuery] = [] + for sub_q in sub_question_texts: + try: + ebm = self._structure_question( + question=sub_q, + question_type=question_type, + ebm_framework=ebm_framework, + backtrack_context=backtrack_context, + ) + sub_queries.append(ebm) + except Exception as exc: # noqa: BLE001 + logger.error("Failed to structure sub-question '%s': %s", sub_q[:60], exc) + + if not sub_queries: + # All sub-questions failed — fall back to full pipeline on original question + logger.warning("All sub-question structuring failed; falling back to full_pipeline") + return self._handle_full_pipeline( + question=sub_question_texts[0] if sub_question_texts else "", + question_type=question_type, + ebm_framework=ebm_framework, + route_confidence=route_confidence, + backtrack_context=backtrack_context, + ) + + first = sub_queries[0] + return { + "route_type": "sub_questions", + "route_confidence": route_confidence, + "question_type": question_type, + "ebm_query": first, + "pico_query": _pico_from_ebm(first), + "sub_pico_queries": sub_queries, + "sub_question_index": 0, + "sub_question_total": len(sub_queries), + "should_terminate": False, + } + + def _handle_full_pipeline( + self, + question: str, + question_type: str, + ebm_framework: str, + route_confidence: float, + backtrack_context: str, + ) -> Dict[str, Any]: + """Structure the question into an EBMQuery and return state updates.""" + ebm = self._structure_question( + question=question, + question_type=question_type, + ebm_framework=ebm_framework, + backtrack_context=backtrack_context, + ) + return { + "route_type": "full_pipeline", + "route_confidence": route_confidence, + "question_type": question_type, + "ebm_query": ebm, + "pico_query": _pico_from_ebm(ebm), + "should_terminate": False, + } + + # ------------------------------------------------------------------ + # Question structuring 
+ # ------------------------------------------------------------------ - return {"pico_query": pico_query, "question_type": question_type} + def _structure_question( + self, + question: str, + question_type: str, + ebm_framework: str, + backtrack_context: str, + ) -> EBMQuery: + """ + Run the appropriate framework prompt(s) and return an EBMQuery. + + Diagnosis questions run diag_step1 → diag_step2 before the PIRD prompt. + """ + # Diagnostic two-step pre-processing + diag_step1_output: Optional[dict] = None + if question_type == "Diagnosis" or ebm_framework in ("pird", "diagnostic_reasoning"): + diag_step1_output = self._run_diag_step1(question, backtrack_context) + + # Select framework prompt + framework_key = ebm_framework if ebm_framework in _FRAMEWORK_PROMPT else "pico" + prompt_stem = _FRAMEWORK_PROMPT[framework_key] + + prompt_template = self._prompts[prompt_stem] + + # Build format kwargs — each template uses a subset of these + fmt_kwargs: Dict[str, str] = { + "question": question, + "question_type": question_type, + "backtrack_context": backtrack_context, + "diag_step1_output": str(diag_step1_output) if diag_step1_output else "", + } + + # Safely format — ignore keys the template doesn't use + try: + prompt = prompt_template.format(**fmt_kwargs) + except KeyError: + # Template has extra placeholders we didn't supply — fill with empty string + import string + formatter = string.Formatter() + keys_needed = {fn for _, fn, _, _ in formatter.parse(prompt_template) if fn} + safe_kwargs = {k: fmt_kwargs.get(k, "") for k in keys_needed} + prompt = prompt_template.format(**safe_kwargs) + + response = self.llm.invoke(prompt) + try: + ebm_dict = robust_parse_json(response.content) + except ValueError as exc: + logger.error("Framework prompt JSON parse failed (%s): %s", prompt_stem, exc) + raise + + return _ebm_query_from_dict(ebm_dict) + + def _run_diag_step1(self, question: str, backtrack_context: str) -> dict: + """Run the diagnostic step-1 analysis prompt.""" + 
prompt = self._prompts["diag_step1"].format( + question=question, + backtrack_context=backtrack_context, + ) + response = self.llm.invoke(prompt) + try: + return robust_parse_json(response.content) + except ValueError as exc: + logger.warning("diag_step1 JSON parse failed: %s", exc) + return {} + + # ------------------------------------------------------------------ + # Utilities + # ------------------------------------------------------------------ + + @staticmethod + def _build_backtrack_context(state: WorkflowState) -> str: + """Build a backtrack context string from state.""" + reason = state.get("backtrack_reason") + if not reason: + return "" + return ( + f"Previous attempt failed with the following reason:\n{reason}\n" + "Please refine the question structuring to address this issue." + ) diff --git a/src/config/prompts/apply_agent.txt b/src/config/prompts/apply_agent.txt index f9f9380..759c0e5 100644 --- a/src/config/prompts/apply_agent.txt +++ b/src/config/prompts/apply_agent.txt @@ -8,11 +8,19 @@ For **Strong**, **Weak**, and **Conditional** recommendations: your recommendati **Input Data:** Original Question: {question} +Structured Query: {query_description} + +Route Type: {route_type} + Evidence Summary: {evidence_summary} Overall Appraisal: {appraisal_summary} +Key Downgrade Factors: {key_downgrade_factors} + +Has Serious Inconsistency Across Studies: {has_serious_inconsistency} + **Previous Attempt Feedback (if any):** {backtrack_context} diff --git a/src/config/prompts/appraise_agent.txt b/src/config/prompts/appraise_agent.txt index 8d99a0d..13ccff3 100644 --- a/src/config/prompts/appraise_agent.txt +++ b/src/config/prompts/appraise_agent.txt @@ -30,6 +30,13 @@ - `GUIDELINE`:临床实践指南(初始等级:Moderate,基于其引用的基础证据质量) - `EXPERT_OPINION`:专家意见、述评、共识声明(无原始数据或系统检索)(初始等级:Very Low) +**included_study_type**(仅适用于 SYSTEMATIC_REVIEW / META_ANALYSIS / NMA) +- `RCT`:该SR/MA主要纳入随机对照试验 +- `OBSERVATIONAL`:该SR/MA主要纳入观察性研究(队列、病例对照等) +- `MIXED`:该SR/MA同时纳入RCT和观察性研究 +- 
`UNKNOWN`:无法从摘要判断纳入研究类型 +- `NA`:非SR/MA/NMA研究,不适用 + ### 二、降级因素(每项独立判断,基于摘要可推断的信息) **risk_of_bias(偏倚风险)** @@ -59,6 +66,8 @@ ### 三、升级因素(仅适用于 COHORT / CASE_CONTROL 研究) +> **注意:** 若 risk_of_bias = SERIOUS 或 VERY_SERIOUS,升级因素**不生效**(upgrade_blocked_by_bias = true)。 + **large_effect(效应量大)** - `YES`:相对风险 > 2 或 < 0.5,且一致 → **+1级** - `NO`:效应量普通 @@ -69,6 +78,15 @@ - `NO`:无剂量-反应关系 - `NA`:不适用 +**confounding_bias_mitigates(混杂偏倚减弱效应)** +- `YES`:存在未控制的混杂因素,但其方向会使真实效应被低估,即真实效应可能更强 → **+1级** +- `NO`:混杂因素不支持升级 +- `NA`:不适用(RCT、SR/MA 或无相关混杂信息) + +**upgrade_blocked_by_bias**(布尔值): +- `true`:risk_of_bias = SERIOUS 或 VERY_SERIOUS,所有升级因素被阻断,不生效 +- `false`:偏倚风险不严重,升级因素可正常生效 + --- ## 冲突评估 @@ -112,6 +130,7 @@ {{ "evidence_id": 1, "study_type": "RCT | SYSTEMATIC_REVIEW | META_ANALYSIS | NMA | COHORT | CASE_CONTROL | CROSS_SECTIONAL | NARRATIVE_REVIEW | CASE_REPORT | GUIDELINE | EXPERT_OPINION", + "included_study_type": "RCT | OBSERVATIONAL | MIXED | UNKNOWN | NA", "risk_of_bias": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS", "inconsistency": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS | NA", "indirectness": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS", @@ -119,6 +138,8 @@ "publication_bias": "SUSPECTED | UNDETECTED", "large_effect": "YES | NO | NA", "dose_response": "YES | NO | NA", + "confounding_bias_mitigates": "YES | NO | NA", + "upgrade_blocked_by_bias": true, "rationale": "一句话说明主要的降级或升级理由" }} ], diff --git a/src/config/prompts/ask/diag_step1.txt b/src/config/prompts/ask/diag_step1.txt new file mode 100644 index 0000000..29e7000 --- /dev/null +++ b/src/config/prompts/ask/diag_step1.txt @@ -0,0 +1,56 @@ +You are a clinical diagnostic reasoning expert for an Evidence-Based Medicine decision-support system. + +This is **Step 1 of a two-step diagnostic question analysis**. Your task is to identify the clinical context and generate a structured differential diagnosis before formulating the PIRD query. 
+ +--- + +## Input + +Clinical Question: {question} +Backtrack Context (if any): {backtrack_context} + +--- + +## Your Task + +Analyze the diagnostic question and produce: + +1. **Clinical Context Summary**: Identify the patient population, the presenting symptom or sign, and the suspected diagnosis or condition being tested for. + +2. **Differential Diagnosis**: List 3–6 plausible diagnoses that the index test is intended to distinguish between, ordered by clinical likelihood. + +3. **Index Test Identification**: Identify the diagnostic test or procedure being evaluated. + +4. **Reference Standard**: Identify the gold standard test or method used to confirm the diagnosis (e.g., biopsy, culture, imaging, clinical criteria). + +5. **Key Diagnostic Outcomes**: What diagnostic accuracy metrics are clinically relevant here? (sensitivity, specificity, PPV, NPV, LR+, LR-, AUC/ROC) + +--- + +## Language Rules + +- `reasoning`, `clinical_context`, `differential_diagnosis`, `key_outcomes`: Write in the **same language as the input question**. +- `keywords`: **Always English MeSH terms**, regardless of input language. 
+ +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "clinical_context": "【Description of patient population and clinical scenario in input language】", + "index_test": "【Name of the diagnostic test being evaluated, in input language】", + "reference_standard": "【Gold standard for confirming diagnosis, in input language】", + "diagnosis_target": "【The condition being diagnosed, in input language】", + "differential_diagnosis": [ + "【Diagnosis 1 in input language】", + "【Diagnosis 2 in input language】" + ], + "key_outcomes": ["sensitivity", "specificity", "PPV", "NPV", "LR+", "LR-"], + "keywords": ["MeSH term 1", "MeSH term 2", "MeSH term 3"] +}} +``` diff --git a/src/config/prompts/ask/diag_step2.txt b/src/config/prompts/ask/diag_step2.txt new file mode 100644 index 0000000..eee4803 --- /dev/null +++ b/src/config/prompts/ask/diag_step2.txt @@ -0,0 +1,75 @@ +You are a clinical diagnostic reasoning expert for an Evidence-Based Medicine decision-support system. + +This is **Step 2 of a two-step diagnostic question analysis**. You have already identified the clinical context, index test, reference standard, and differential diagnosis in Step 1. Now you will formalize this into a structured PIRD query for the literature search. 
+ +--- + +## Input + +Original Question: {question} +Step 1 Analysis: +- Clinical Context: {clinical_context} +- Index Test: {index_test} +- Reference Standard: {reference_standard} +- Diagnosis Target: {diagnosis_target} +- Differential Diagnosis: {differential_diagnosis} +- Key Outcomes: {key_outcomes} +- Step 1 Keywords: {step1_keywords} +Backtrack Context (if any): {backtrack_context} + +--- + +## Your Task + +Using the Step 1 analysis, construct a formal PIRD query: + +**P — Patient**: The specific patient population (age, sex, comorbidities, clinical setting if relevant) +**I — Index Test**: The diagnostic test being evaluated +**R — Reference Standard**: The gold standard used to confirm the diagnosis +**D — Diagnosis Target**: The condition being diagnosed + +Then generate a comprehensive PubMed keyword list covering: +- The patient population (MeSH terms + synonyms) +- The index test (MeSH terms + synonyms + brand names if applicable) +- The reference standard +- The diagnosis target (MeSH terms + synonyms) +- Diagnostic accuracy terms (sensitivity, specificity, ROC, etc.) + +--- + +## Language Rules + +- `patient`, `index_test`, `reference_standard`, `diagnosis_target`: Write in the **same language as the input question**. +- `keywords`: **Always English MeSH terms and synonyms**, regardless of input language. + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "query_type": "pird", + "patient": "【Patient population in input language】", + "primary_focus": "【Index test name in input language】", + "comparator": "【Reference standard in input language】", + "reference_standard": "【Reference standard in input language】", + "outcome": "【Diagnostic accuracy outcomes: sensitivity, specificity, etc. 
in input language】", + "diagnosis_target": "【Condition being diagnosed in input language】", + "time_horizon": null, + "keywords": [ + "patient MeSH term", + "index test MeSH term", + "index test synonym", + "reference standard MeSH term", + "diagnosis target MeSH term", + "sensitivity", + "specificity", + "diagnostic accuracy", + "ROC curve" + ] +}} +``` diff --git a/src/config/prompts/ask/direct_answer.txt b/src/config/prompts/ask/direct_answer.txt new file mode 100644 index 0000000..5b342e5 --- /dev/null +++ b/src/config/prompts/ask/direct_answer.txt @@ -0,0 +1,54 @@ +You are a clinical knowledge expert for an Evidence-Based Medicine decision-support system. + +The routing system has determined that the incoming question does NOT require a literature search — it can be answered directly from established medical knowledge, clinical guidelines, or pharmacological facts. + +--- + +## Input + +Clinical Question: {question} +Question Type: {question_type} +Routing Rationale: {routing_rationale} + +--- + +## Your Task + +Provide a clear, accurate, and concise direct answer to the question. + +**Guidelines:** +- Answer from established medical knowledge, standard clinical guidelines, or well-accepted pharmacological facts. +- Do NOT fabricate citations or invent study results. +- If the question touches on an area with known guideline variation (e.g., different thresholds in ESC vs ACC/AHA), briefly note the variation. +- If the question is partially outside your confident knowledge, state the limitation explicitly. +- Keep the answer focused and clinically useful — avoid unnecessary padding. + +**Scope boundaries:** +- You MAY answer: definitions, mechanisms, normal ranges, drug classes, standard dosing principles, diagnostic criteria, classification systems, anatomy, physiology. +- You MUST NOT answer: questions requiring current evidence synthesis, comparative effectiveness claims, or prognosis estimates that depend on study data. 
+- If you realize the question actually requires a literature search, set `requires_pipeline` to `true` and explain why in `requires_pipeline_reason`. + +--- + +## Language Requirement + +All output fields (`answer`, `caveats`, `requires_pipeline_reason`) must be written in the **same language as the input question**. + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "answer": "【Direct answer in input language】", + "answer_basis": "established_knowledge | clinical_guideline | pharmacology | anatomy_physiology | classification_criteria", + "guideline_source": "e.g. WHO 2023, ACC/AHA 2022, or null if not applicable", + "caveats": ["caveat 1 in input language", "caveat 2 in input language"], + "requires_pipeline": false, + "requires_pipeline_reason": "null or explanation if the question actually needs a literature search" +}} +``` diff --git a/src/config/prompts/ask/ebm_peo.txt b/src/config/prompts/ask/ebm_peo.txt new file mode 100644 index 0000000..a0d670b --- /dev/null +++ b/src/config/prompts/ask/ebm_peo.txt @@ -0,0 +1,74 @@ +You are a clinical question structuring expert for an Evidence-Based Medicine decision-support system. + +Your task is to convert an **Epidemiology, Harm, or Etiology** clinical question into a structured PEO query for PubMed literature search. + +The PEO framework is used when there is no active intervention — instead, a population is exposed to a risk factor, environmental condition, or etiological agent, and we want to know the effect on an outcome. + +--- + +## Input + +Clinical Question: {question} +Question Type: {question_type} +Backtrack Context (if any): {backtrack_context} + +--- + +## PEO Framework + +**P — Patient/Population**: The specific population being studied. Include relevant characteristics (age, sex, disease status, clinical setting) if stated or clearly implied. + +**E — Exposure**: The risk factor, etiological agent, environmental condition, or prognostic factor being studied. 
This is NOT an active intervention — it is something the population is exposed to or has (e.g., "smoking", "obesity", "air pollution", "BRCA1 mutation"). + +**O — Outcome**: The health outcome of interest (e.g., "lung cancer incidence", "cardiovascular mortality", "disease progression", "quality of life"). + +--- + +## Exposure Type Classification + +Classify the exposure: +- `risk_factor`: A modifiable or non-modifiable characteristic associated with increased disease risk (e.g., hypertension, smoking) +- `environmental`: An external environmental agent (e.g., air pollution, radiation, occupational exposure) +- `genetic`: A genetic variant or hereditary factor (e.g., BRCA1, APOE4) +- `behavioral`: A lifestyle or behavioral factor (e.g., physical inactivity, diet) +- `iatrogenic`: A medical treatment or procedure as a potential harm source +- `comorbidity`: A co-existing disease as a prognostic factor + +--- + +## Language Rules + +- `patient`, `primary_focus` (exposure), `outcome`: Write in the **same language as the input question**. +- `keywords`: **Always English MeSH terms and synonyms**, regardless of input language. + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "query_type": "peo", + "patient": "【Patient population in input language】", + "primary_focus": "【Exposure or risk factor in input language】", + "comparator": "【Unexposed population or reference group, in input language, or null if not applicable】", + "outcome": "【Health outcome(s) in input language】", + "exposure_type": "risk_factor | environmental | genetic | behavioral | iatrogenic | comorbidity", + "reference_standard": null, + "time_horizon": "【Time horizon if relevant, e.g. 
'5-year incidence', or null】", + "keywords": [ + "patient MeSH term", + "patient synonym", + "exposure MeSH term", + "exposure synonym", + "outcome MeSH term", + "outcome synonym", + "epidemiology", + "risk factors", + "cohort studies" + ] +}} +``` diff --git a/src/config/prompts/ask/ebm_pico.txt b/src/config/prompts/ask/ebm_pico.txt new file mode 100644 index 0000000..be37275 --- /dev/null +++ b/src/config/prompts/ask/ebm_pico.txt @@ -0,0 +1,66 @@ +You are a clinical question structuring expert for an Evidence-Based Medicine decision-support system. + +Your task is to convert a **Therapy, Prevention, or Harm** clinical question into a structured PICO query for PubMed literature search. + +--- + +## Input + +Clinical Question: {question} +Question Type: {question_type} +Backtrack Context (if any): {backtrack_context} + +--- + +## PICO Framework + +**P — Patient/Problem**: The specific patient population, disease, or clinical condition. Include relevant characteristics (age, sex, disease stage, comorbidities) if stated or clearly implied. + +**I — Intervention**: The main treatment, drug, procedure, or exposure being evaluated. + +**C — Comparison**: The comparator (alternative treatment, placebo, standard of care, or no treatment). If not explicitly stated, infer the most clinically appropriate comparator. For harm questions, this may be "unexposed population". + +**O — Outcome**: The primary clinical outcome(s) of interest (e.g., mortality, recurrence, adverse events, quality of life, hospitalization). + +--- + +## Question Type Guidance + +- **Therapy**: Focus on efficacy outcomes (mortality, symptom relief, disease control). Comparison is usually placebo, standard of care, or alternative drug. +- **Prevention**: Focus on incidence reduction. Comparison is usually no intervention or placebo. +- **Harm**: Focus on adverse outcomes. Comparison is usually unexposed or alternative exposure. 
+ +--- + +## Language Rules + +- `patient`, `primary_focus` (intervention), `comparator`, `outcome`: Write in the **same language as the input question**. +- `keywords`: **Always English MeSH terms and synonyms**, regardless of input language. PubMed only supports English queries. + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "query_type": "pico", + "patient": "【Patient population in input language】", + "primary_focus": "【Intervention in input language】", + "comparator": "【Comparison in input language, or null if not applicable】", + "outcome": "【Primary outcome(s) in input language】", + "reference_standard": null, + "time_horizon": null, + "keywords": [ + "patient MeSH term", + "patient synonym", + "intervention MeSH term", + "intervention synonym", + "outcome MeSH term", + "outcome synonym" + ] +}} +``` diff --git a/src/config/prompts/ask/ebm_pird.txt b/src/config/prompts/ask/ebm_pird.txt new file mode 100644 index 0000000..450b7ee --- /dev/null +++ b/src/config/prompts/ask/ebm_pird.txt @@ -0,0 +1,75 @@ +You are a clinical question structuring expert for an Evidence-Based Medicine decision-support system. + +Your task is to convert a **Diagnosis** clinical question into a structured PIRD query for PubMed literature search. + +--- + +## Input + +Clinical Question: {question} +Question Type: {question_type} +Step 1 Diagnostic Analysis (if available): {diag_step1_output} +Backtrack Context (if any): {backtrack_context} + +--- + +## PIRD Framework + +**P — Patient**: The specific patient population in whom the test is being applied. Include relevant characteristics (age, sex, disease stage, clinical setting) if stated or clearly implied. + +**I — Index Test**: The diagnostic test, tool, biomarker, or procedure being evaluated for its diagnostic accuracy. + +**R — Reference Standard**: The gold standard method used to confirm or exclude the diagnosis. If not explicitly stated, infer the most appropriate reference standard based on clinical knowledge. 
+ +**D — Diagnosis Target**: The specific disease or condition being diagnosed (the target condition). + +--- + +## Diagnostic Accuracy Outcomes + +Always include diagnostic accuracy metrics as the outcome: +- Sensitivity and specificity +- Positive predictive value (PPV) and negative predictive value (NPV) +- Likelihood ratios (LR+, LR-) +- Area under the ROC curve (AUC/AUROC) +- Diagnostic odds ratio (DOR) + +--- + +## Language Rules + +- `patient`, `primary_focus` (index test), `comparator` (reference standard), `outcome`, `reference_standard`: Write in the **same language as the input question**. +- `keywords`: **Always English MeSH terms and synonyms**, regardless of input language. + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "query_type": "pird", + "patient": "【Patient population in input language】", + "primary_focus": "【Index test name in input language】", + "comparator": "【Reference standard in input language】", + "reference_standard": "【Reference standard in input language】", + "outcome": "【Diagnostic accuracy outcomes in input language: sensitivity, specificity, PPV, NPV, AUC】", + "time_horizon": null, + "keywords": [ + "patient MeSH term", + "patient synonym", + "index test MeSH term", + "index test synonym", + "reference standard MeSH term", + "diagnosis target MeSH term", + "sensitivity", + "specificity", + "diagnostic accuracy", + "ROC curve", + "predictive value of tests" + ] +}} +``` diff --git a/src/config/prompts/ask/ebm_prognosis.txt b/src/config/prompts/ask/ebm_prognosis.txt new file mode 100644 index 0000000..81ef157 --- /dev/null +++ b/src/config/prompts/ask/ebm_prognosis.txt @@ -0,0 +1,76 @@ +You are a clinical question structuring expert for an Evidence-Based Medicine decision-support system. + +Your task is to convert a **Prognosis** clinical question into a structured query for PubMed literature search. 
+ +The prognosis framework is used when the question asks about the natural course of a disease, survival rates, risk factors for adverse outcomes, or long-term outcomes in a defined patient population. + +--- + +## Input + +Clinical Question: {question} +Question Type: {question_type} +Backtrack Context (if any): {backtrack_context} + +--- + +## Prognosis Query Framework + +**P — Patient**: The specific patient population with the disease or condition of interest. Include relevant characteristics (age, sex, disease stage, comorbidities, treatment status) if stated or clearly implied. + +**Prognostic Factor**: The specific factor whose prognostic value is being assessed (e.g., "LVEF < 35%", "KRAS mutation", "age > 65", "elevated troponin"). If the question is about overall prognosis without a specific factor, use the disease itself as the prognostic factor. + +**O — Outcome**: The clinical outcome of interest (e.g., "5-year overall survival", "disease-free survival", "recurrence rate", "hospitalization", "functional decline", "quality of life"). + +**T — Time Horizon**: The time frame over which the outcome is measured (e.g., "30-day mortality", "5-year survival", "10-year recurrence"). If not explicitly stated, infer the most clinically relevant time horizon. + +--- + +## Prognosis Question Subtypes + +Classify the prognosis question: +- `overall_prognosis`: What is the expected course/outcome for patients with this condition? (no specific prognostic factor) +- `prognostic_factor`: Does a specific factor (biomarker, clinical variable, genetic marker) predict outcomes? +- `survival_analysis`: What is the survival rate at a specific time point? +- `risk_stratification`: How do different risk groups differ in outcomes? +- `disease_progression`: How does the disease evolve over time? + +--- + +## Language Rules + +- `patient`, `primary_focus` (prognostic factor), `outcome`: Write in the **same language as the input question**. 
+- `keywords`: **Always English MeSH terms and synonyms**, regardless of input language. + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "query_type": "prognosis", + "patient": "【Patient population with disease/condition in input language】", + "primary_focus": "【Prognostic factor or disease name if overall prognosis, in input language】", + "comparator": "【Reference group for comparison, e.g. 'patients without the factor', or null】", + "outcome": "【Clinical outcome(s) in input language】", + "prognosis_subtype": "overall_prognosis | prognostic_factor | survival_analysis | risk_stratification | disease_progression", + "reference_standard": null, + "time_horizon": "【Time horizon in input language, e.g. '5年总生存率' or '5-year overall survival', or null if not specified】", + "keywords": [ + "disease MeSH term", + "disease synonym", + "prognostic factor MeSH term", + "prognostic factor synonym", + "outcome MeSH term", + "outcome synonym", + "prognosis", + "survival", + "mortality", + "cohort studies" + ] +}} +``` diff --git a/src/config/prompts/ask/router.txt b/src/config/prompts/ask/router.txt new file mode 100644 index 0000000..79e95b6 --- /dev/null +++ b/src/config/prompts/ask/router.txt @@ -0,0 +1,84 @@ +You are a clinical question triage expert for an Evidence-Based Medicine (EBM) decision-support system. + +Your job is to classify the incoming clinical question into one of three routing categories, and to identify the appropriate EBM query framework. + +--- + +## Input + +Clinical Question: {question} +Backtrack Context (if any): {backtrack_context} + +--- + +## Step 1 — Route Classification + +Classify the question into exactly one of: + +**direct_answer** — The question does NOT require a literature search. 
Use this when: +- The question is a factual/definitional query answerable from established medical knowledge (e.g., "What is the normal range of serum sodium?", "What does LVEF stand for?") +- The question is about drug dosing, pharmacokinetics, or well-established clinical thresholds +- The question is clearly outside the scope of clinical EBM (e.g., administrative, billing, non-clinical) +- Confidence that a literature search would add no value: > 0.85 + +**full_pipeline** — The question requires a full EBM literature search (Ask → Acquire → Appraise → Apply). Use this when: +- The question involves treatment efficacy, diagnostic accuracy, prognosis, harm, or prevention +- The answer depends on the current state of clinical evidence +- The question has a single, well-defined PICO/PIRD/PEO focus + +**sub_questions** — The question is complex and must be decomposed into 2–4 independent sub-questions, each requiring its own literature search. Use this when: +- The question contains multiple distinct clinical decisions (e.g., "Should I use drug A or B, and what monitoring is needed?") +- The question spans multiple EBM domains (e.g., both diagnosis and treatment) +- A single PICO cannot capture the full scope + +--- + +## Step 2 — Question Type Classification + +Classify into one of: +- `Therapy`: Treatment efficacy, drug comparisons, interventions +- `Diagnosis`: Diagnostic test accuracy, sensitivity/specificity, screening +- `Prognosis`: Disease course, survival, risk factors, long-term outcomes +- `Harm`: Adverse effects, risks, harmful exposures +- `Prevention`: Preventive interventions before disease onset +- `Background`: Factual/definitional questions (use with direct_answer route) +- `Mixed`: Spans multiple types (use with sub_questions route) + +--- + +## Step 3 — EBM Framework Selection (for full_pipeline and sub_questions only) + +Select the most appropriate query framework: +- `pico`: Therapy / Prevention / Harm questions (Patient, Intervention, 
Comparison, Outcome) +- `pird`: Diagnosis questions (Patient, Index test, Reference standard, Diagnosis target) +- `peo`: Epidemiology / Harm / Prognosis questions without a direct intervention (Patient, Exposure, Outcome) +- `prognosis`: Pure prognosis questions (Patient, Prognostic factor, Outcome, Time horizon) +- `diagnostic_reasoning`: Complex differential diagnosis requiring clinical reasoning steps + +--- + +## Language Rules + +- `reasoning`: Write in the same language as the input question. +- `direct_answer_text`: Write in the same language as the input question (only when route_type = "direct_answer"). +- `sub_question_texts`: Write in the same language as the input question (only when route_type = "sub_questions"). + +--- + +## Output Format + +**Reasoning:** + + +**JSON:** +```json +{{ + "route_type": "direct_answer | full_pipeline | sub_questions", + "route_confidence": 0.0, + "question_type": "Therapy | Diagnosis | Prognosis | Harm | Prevention | Background | Mixed", + "ebm_framework": "pico | pird | peo | prognosis | diagnostic_reasoning | null", + "direct_answer_text": "【Answer in input language】 or null", + "sub_question_texts": ["sub-question 1", "sub-question 2"] or null, + "routing_rationale": "One sentence explaining the routing decision in input language" +}} +``` diff --git a/src/config/prompts/judge/acquire_judge.txt b/src/config/prompts/judge/acquire_judge.txt index 675b4dc..e338b2f 100644 --- a/src/config/prompts/judge/acquire_judge.txt +++ b/src/config/prompts/judge/acquire_judge.txt @@ -1,121 +1,121 @@ # Role -你是一个严格的循证医学(EBM)审计员。你的任务是对Acquire Agent获取的证据进行客观分类判断,**不要打分**,只需判断每个检查点是否达标。 +你是一个严格的EBM审计员。对 Acquire Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 # 核心EBM原则 -**证据质量 ≠ 证据数量**。1篇Cochrane系统评价 > 10篇RCT > 100篇病例报告。 -**基于最佳证据评判**,而非对所有证据取平均。 +证据质量 ≠ 证据数量。1篇Cochrane系统评价 > 10篇RCT > 100篇病例报告。 -# 背景说明 -Acquire Agent通过两步完成证据获取: -1. LLM构建PubMed Boolean检索策略,检索候选文献(最多20篇) -2. 
LLM使用**Listwise排序策略**:将所有候选文献同时输入,比较后选出Top-K最相关的证据并排序 - -因此,**证据列表已按相关性排序**(`relevance_score`为排名归一化分数:排名第1 → 1.0,依次递减)。评判"最佳证据"时,排在前列的文献是LLM认为最相关的,但你应独立判断这一选择是否合理。 - -# Input Data -## PICO查询 -{pico_query} - -## Acquire Agent 输出(已排序的证据列表) -{stage_output} +# Input +路由类型:{route_type} +结构化查询:{ebm_query} +Acquire Agent 输出(已排序的证据列表):{stage_output} # 预处理:系统错误检测 - -**首先检查输入数据中是否包含 `error` 字段**(如 `"error": "Connection timeout"` 或 `"error": "HTTP 500"`): -- 如果存在 `error` 字段,说明 PubMed API 调用本身失败(网络错误/服务不可用),**与检索词无关**。 -- 此时请**跳过所有审计项目**,直接按下列固定值输出: - - `search_terms_valid` → `"YES"`(检索词本身无误,是 API 层面出错) - - `best_study_type` → `"NONE"`,`best_evidence_answers_pico` → `"NO"` - - `pico_p_match`、`pico_i_match`、`pico_o_match` → 均为 `"NO"` - - `top_selection_appropriate`、`selection_count_appropriate` → 均为 `"YES"` - - `search_exhausted` → `false` - - `reasoning` → 填写"PubMed API调用失败(系统错误),非检索词问题,建议保持原检索词重试" - -# Audit Task - -## 1. 检索有效性审计 - -**search_terms_valid**:检索词构建是否正确? -- `YES`:检索词使用了合适的医学术语,方向正确 -- `NO`:检索词存在根本性错误(无关词汇、严重拼写错误),导致检索方向完全错误 - -## 2. 最佳证据质量审计 -**基于证据列表中质量最高的那篇证据进行判断(不一定是排名第一的)。** - -**best_study_type**:找到的最高层级证据类型是什么? -- `SR_META`:系统评价或Meta分析(证据金字塔顶端) -- `RCT`:随机对照试验 -- `COHORT`:队列研究 -- `CASE_CONTROL`:病例对照研究 -- `CASE_REPORT`:病例报告或专家意见(证据金字塔底端) -- `NONE`:未找到任何可分级的证据 - -**best_evidence_answers_pico**:质量最高的那篇证据是否能直接回答PICO临床问题? -- `YES`:能够直接回答(P、I、O均覆盖) -- `PARTIAL`:部分回答,存在轻微间接性(如人群略有差异) -- `NO`:即使存在文章,也无法回答PICO所问的临床问题 - -## 3. PICO匹配度审计 -**基于证据列表中PICO匹配度最好的那篇证据进行判断。** - -**pico_p_match**:证据中的研究人群是否与PICO的Patient匹配? -- `YES`:精准匹配(相同年龄段、相同疾病状态) -- `PARTIAL`:有轻微差异(如年龄范围略不同),结论可审慎外推 -- `NO`:严重不匹配(如成人证据用于儿科问题,或完全不同的疾病) - -**pico_i_match**:证据中的干预措施是否与PICO的Intervention匹配? -- `YES`:精准匹配(相同干预、相同给药路径) -- `PARTIAL`:有轻微差异(同类药物不同剂量,或相关干预),相关性高 -- `NO`:严重不匹配(完全不同的干预或对照措施) - -**pico_o_match**:证据中报告的结局是否与PICO的Outcome匹配? -- `YES`:报告了临床关心的直接结局指标 -- `PARTIAL`:报告了代理指标或部分相关结局 -- `NO`:未报告任何与PICO Outcome相关的结局 - -## 4. 
Listwise选择合理性审计 -**评判LLM的选择决策是否合理。** - -**top_selection_appropriate**:排名靠前的文献(排名第1-3位)是否确实是列表中最优的证据选择? -- `YES`:排名前列的文献研究层级高且PICO匹配度好,选择合理 -- `PARTIAL`:总体合理,但有1-2篇明显应优先于当前排名更靠前的文献 -- `NO`:排名顺序明显不合理(如病例报告排在SR/RCT前面),或纳入了明显不相关的文献 - -**selection_count_appropriate**:考虑`total_results`(候选总数)和`selected_count`(选中数),选择数量是否合理? -- `YES`:选择数量与候选质量相符(如高质量检索有5篇相关,选5篇合理) -- `PARTIAL`:数量略多或略少,但整体可接受 -- `NO`:明显不合理——候选中有充分证据但选择极少(≤2篇),或候选质量差仍凑满10篇 - -## 5. 检索穷尽判断 - -**search_exhausted**:检索是否已穷尽,应停止重试? -满足以下**全部**条件时设为 `true`: -1. 检索词构建合理(`search_terms_valid = YES`) -2. 找到的相关证据极少或为0 -3. 该临床问题可能属于罕见病、新兴领域或当前尚无高质量证据的领域 - -若检索词本身有问题,应设为 `false`(应先修复检索词,而非放弃)。 +首先检查输入数据中是否包含 error 字段(如 "error": "Connection timeout"): +如果存在 error 字段,说明 PubMed API 调用本身失败,与检索词无关。 +此时跳过所有审计项,直接输出:search_terms_valid=YES,所有 rubric 填 NA,search_exhausted=false,failures=[],overall_quality=pass。 + +# 一票否决项(Gate) + +## G1. search_terms_valid +检索词方向是否正确,能对应到查询的核心概念? +- YES:检索词方向正确 +- NO:检索词方向完全错误(如问心衰治疗却检索肾功能指标) + +# Rubric 评分项 + +各 route_type 对应的主焦点维度: +- ebm_pico:Intervention +- ebm_pird:Index Test +- ebm_peo:Exposure +- ebm_prognosis:Prognostic Factor + +## R1. keywords_cover_pico_dimensions【Critical,权重3】 +关键词是否覆盖 P + 主焦点维度,且至少含一个可在 MeSH 验证的标准词? +- YES:覆盖 P + 主焦点维度,且含 MeSH 标准词 +- PARTIAL:覆盖了 P 或主焦点之一,但另一维度无对应关键词;或有覆盖但无 MeSH 标准词 +- NO:关键词全部指向同一概念,未覆盖多个维度 + +## R2. primary_focus_match【Critical,权重3】 +基于证据列表中主焦点匹配度最好的那篇证据判断:证据中的核心干预/暴露/测试是否与查询主焦点维度匹配? +- YES:精准匹配 +- PARTIAL:同类方法但有差异(不同剂量/版本),相关性高 +- NO:完全不同的测试/干预/暴露 + +## R3. outcome_match【Critical,权重3】 +基于证据列表中结局匹配度最好的那篇证据判断:证据是否报告了临床关心的结局指标? +- YES:报告了直接结局指标 +- PARTIAL:报告了代理指标或部分相关结局 +- NO:未报告任何相关结局 + +## R4. keywords_have_synonyms【Major,权重2】 +核心概念是否有同义词/变体(如 SGLT2i + empagliflozin + dapagliflozin)? +- YES:有同义词/变体 +- PARTIAL:有部分同义词但不完整 +- NO:无任何同义词扩展,仅有单一术语 + +## R5. keywords_count_sufficient【Major,权重2】 +关键词数量是否充足? +- YES:≥ 5 个 +- PARTIAL:3-4 个 +- NO:≤ 2 个 + +## R6. study_design_matches_route【Major,权重2】 +纳入文献的研究设计是否与 route_type 的优先级匹配? 
+匹配表: +- ebm_pico:第一优先级=SR/Meta分析(基于RCT),第二=RCT,第三=观察性研究,排除=机制综述/专家意见/病例报告 +- ebm_pird:第一优先级=SR/Meta分析(基于诊断准确性研究),第二=诊断准确性研究(横断面),第三=回顾性诊断研究,排除=机制综述/治疗类RCT +- ebm_peo:第一优先级=SR/Meta分析(基于观察性研究),第二=前瞻性队列,第三=病例对照,排除=RCT/机制综述 +- ebm_prognosis:第一优先级=SR/Meta分析(基于队列研究),第二=前瞻性队列,第三=回顾性队列,排除=机制综述/病例报告 +- YES:有第一优先级文献 +- PARTIAL:有次优先级文献但无第一优先级,或混入少量不匹配设计 +- NO:大量纳入与 route_type 不匹配的研究设计 + +## R7. population_match【Major,权重2】 +基于证据列表中人群匹配度最好的那篇证据判断:研究人群是否与查询 Patient 匹配? +- YES:精准匹配(相同年龄段、相同疾病状态) +- PARTIAL:有轻微差异,结论可审慎外推 +- NO:严重不匹配(成人证据用于儿科;完全不同疾病) + +## R8. top_selection_appropriate【Minor,权重1】 +排名靠前的文献(排名第1-3位)是否确实是列表中最优的证据选择? +- YES:排名前列的文献研究层级高且匹配度好 +- PARTIAL:总体合理,但有个别文献位置不最优 +- NO:排名顺序明显不合理(如病例报告排在SR/RCT前面) + +## R9. selection_count_appropriate【Minor,权重1】 +选取数量是否合理? +- YES:数量与候选质量相符 +- PARTIAL:数量略多或略少,但整体可接受 +- NO:明显不合理(大量高质量候选却只选1-2篇,或质量极差仍凑满10篇) + +## R10. key_sentences_present【Minor,权重1】 +Top 文章的 key_sentences 字段是否有实质内容? +- YES:Top 文章的 key_sentences 非空,RAG 流程正常执行 +- PARTIAL:部分文章 key_sentences 为空(摘要极短导致 chunk 失败) +- NO:所有文章 key_sentences 均为空,RAG 流程可能失败 # Output Format -仅输出以下JSON格式,不要包含任何其他文本: +仅输出以下 JSON,不要包含任何其他文本: ```json {{ - "search_audit": {{ + "gate_results": {{ "search_terms_valid": "YES | NO" }}, - "evidence_audit": {{ - "best_study_type": "SR_META | RCT | COHORT | CASE_CONTROL | CASE_REPORT | NONE", - "best_evidence_answers_pico": "YES | PARTIAL | NO", - "pico_p_match": "YES | PARTIAL | NO", - "pico_i_match": "YES | PARTIAL | NO", - "pico_o_match": "YES | PARTIAL | NO" - }}, - "listwise_audit": {{ + "rubric_results": {{ + "keywords_cover_pico_dimensions": "YES | PARTIAL | NO", + "primary_focus_match": "YES | PARTIAL | NO", + "outcome_match": "YES | PARTIAL | NO", + "keywords_have_synonyms": "YES | PARTIAL | NO", + "keywords_count_sufficient": "YES | PARTIAL | NO", + "study_design_matches_route": "YES | PARTIAL | NO", + "population_match": "YES | PARTIAL | NO", "top_selection_appropriate": "YES | PARTIAL | NO", - "selection_count_appropriate": "YES | PARTIAL | NO" 
+ "selection_count_appropriate": "YES | PARTIAL | NO", + "key_sentences_present": "YES | PARTIAL | NO" }}, "search_exhausted": false, - "reasoning": "一句话说明最佳证据情况、Listwise选择质量及主要问题" + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" }} ``` diff --git a/src/config/prompts/judge/apply_judge.txt b/src/config/prompts/judge/apply_judge.txt index 5fc3252..992b77b 100644 --- a/src/config/prompts/judge/apply_judge.txt +++ b/src/config/prompts/judge/apply_judge.txt @@ -1,85 +1,100 @@ # Role -你是一个严格的循证医学(EBM)审计员。你的任务是对Apply Agent生成的临床推荐进行客观分类判断,**不要打分**,只需判断每个检查点是否达标。 - -# Input Data -## PICO查询 -{pico_query} - -## 证据评价结果(来自Appraise阶段) -{appraisal_results} - -## Apply Agent 输出(临床推荐) -{stage_output} - -# Audit Task - -## 1. 推荐-证据匹配审计 - -**recommendation_based_on_evidence**:推荐是否严格基于本次检索和评价的证据? -- `YES`:推荐完全来源于提供的证据,每个推荐点均可追溯至具体证据 -- `PARTIAL`:推荐大部分基于证据,存在轻微外推(如从相似人群推广),但在合理范围内 -- `NO`:推荐严重超出证据范围,或推荐结论与证据方向相反 - -**uses_external_knowledge**:是否引入了本次证据列表中完全不支持的新主张? -(判断标准:推荐中出现了**无法追溯至任何一条所提供证据**的实质性断言,通常表现为"基于临床经验"、"通常认为"、"一般指南推荐"等绕过证据的说法。) - -注意:以下情况**不属于**外部知识,不应标注 YES: -- 直接引用或改写了证据列表中某篇文献所描述的内容(如疾病的病因、诊断标准),即使该内容也存在于模型训练数据中 -- 对证据内容的合理临床解读或推论(如从诊断标准推导出诊断建议) -- 说明证据局限性(如"仅有Very Low证据"、"缺乏RCT") - -- `YES`:推荐引入了证据列表完全不支持的实质性新主张(如推荐了未在任何证据中提及的特定疗法) -- `NO`:推荐内容均可追溯至所提供的证据,或属于对证据的合理临床解读 - -## 2. 推荐强度合理性审计 - -**insufficient_evidence_appropriate**:如果输出了"证据不足"/"Insufficient Evidence"/"No Recommendation",该判断是否正确? -- `YES`:证据确实不足(如仅有Very Low质量证据,或无相关证据),该输出是正确的 -- `NO`:证据足够支撑推荐,但错误输出了"证据不足" -- `NA`:给出了明确的推荐(包括 Strong/Weak/Conditional/Consensus-based,非"证据不足"类输出) - -**strength_matches_evidence_quality**:(仅当 `insufficient_evidence_appropriate = NA` 时判断)推荐强度是否与证据质量匹配? 
-EBM原则: -- Strong推荐需要High/Moderate直接证据; -- Weak推荐适用于Low质量证据或结果不一致; -- Conditional推荐适用于仅有间接证据(不同人群、替代终点、相似干预); -- Consensus-based推荐适用于仅有专家共识/指南,无直接研究证据; -- Very Low证据且不一致 → 只能支持Weak/Conditional/Consensus-based或证据不足声明。 -- `YES`:推荐强度与证据质量严格匹配(含Conditional/Consensus-based使用正确) -- `MINOR_MISMATCH`:有轻微偏差(如Moderate证据给出Strong,但结果高度一致),临床上可接受 -- `MAJOR_MISMATCH`:严重不匹配(如Very Low/Low证据给出Strong推荐,或有直接高质量证据却用Conditional) - -## 3. 临床可操作性审计 - -**recommendation_specific**:推荐内容是否足够具体、可执行? -- `YES`:推荐包含关键细节(适应症、给药方式、关键参数等),临床医生可直接执行 -- `PARTIAL`:推荐基本明确,但缺少部分关键执行细节 -- `NO`:推荐过于模糊笼统,临床医生无法据此做出决策 - -**caveats_documented**:重要的适用性限制是否在caveats中说明? -(如人群限制、证据间接性、PICO不匹配、特殊禁忌等) -- `YES`:主要限制均已清晰说明 -- `PARTIAL`:说明了部分限制,有重要遗漏 -- `NO`:存在重要限制但完全未说明 -- `NA`:证据与PICO直接适用,无需特别说明 +你是一个严格的EBM审计员。对 Apply Agent 生成的临床推荐进行客观分类判断,只输出结构化 JSON,不要打分。 + +# Input +路由类型:{route_type} +结构化查询:{query_description} +证据评价结果(来自Appraise阶段):{appraisal_results} +Apply Agent 输出(临床推荐):{stage_output} + +# 一票否决项(Gate) + +## G1. recommendation_grounded_in_evidence +推荐意见是否基于本次检索的证据,方向与证据一致? +- YES:推荐完全来源于提供的证据,方向一致 +- NO:推荐与证据无关或方向相反 + +## G2. route_dimension_consistent +Apply 的维度一致性检查是否使用了与 route_type 匹配的框架? +各 route_type 对应的正确框架: +- ebm_pico:Population / Intervention / Comparator / Outcome +- ebm_pird:Population / Index Test / Reference Standard / Target Condition +- ebm_peo:Population / Exposure / Outcome(无 Comparator) +- ebm_prognosis:Population / Prognostic Factor / Outcome / Time Horizon +- YES:维度框架与 route_type 匹配 +- NO:使用了错误框架(如 PIRD 问题用 PICO 框架,Index Test 被映射为 Intervention) + +## G3. strength_not_grossly_inflated +推荐强度是否未严重超出证据上限? +- YES:推荐强度在证据支持范围内 +- NO:Very Low 或 Low 证据给出 Strong 推荐,或有充分高质量证据却输出 No Recommendation + +# Rubric 评分项 + +## R1. effect_size_correctly_reported【Critical,权重3】 +效应量、置信区间、GRADE 等级是否被正确转述,无数据失真? +- YES:数值被正确转述,无失真 +- PARTIAL:数值基本正确,有轻微表述偏差但不影响结论方向 +- NO:效应量或 GRADE 等级被错误转述,导致结论方向改变 + +## R2. strength_matches_evidence【Critical,权重3】 +推荐强度是否与证据等级严格匹配? 
+注意:inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐属正确行为,不应标注为不匹配。 +EBM原则:Strong需要High/Moderate直接证据;Weak适用于Low质量或结果不一致;Conditional适用于仅有间接证据;Consensus-based适用于仅有专家共识/指南。 +- YES:推荐强度与证据等级严格匹配(含上述特殊情况) +- PARTIAL:有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 +- NO:推荐强度与证据等级明显不符(不触发 gate 的中等程度不匹配) + +## R3. population_applicability_addressed【Major,权重2】 +是否明确说明了证据人群与当前患者的匹配程度,包括可外推性或外推限制? +- YES:明确说明了人群匹配程度和外推性 +- PARTIAL:有提及人群差异但说明不充分 +- NO:完全未讨论人群适配性 + +## R4. uncertainty_source_explained【Major,权重2】 +不确定性的来源是否被明确说明(如样本量不足、间接证据、研究设计局限)? +- YES:不确定性来源被明确说明 +- PARTIAL:提及了不确定性但未说明来源 +- NO:未提及不确定性,或仅说"证据有限"而无来源说明 + +## R5. citation_traceable【Major,权重2】 +推荐依据是否有文献溯源(PMID 或标题可追溯)? +- YES:推荐依据有文献溯源 +- PARTIAL:部分推荐有溯源,部分缺失 +- NO:无任何文献溯源 + +## R6. recommendation_specific【Minor,权重1】 +推荐内容是否足够具体,临床医生可据此执行(含适应症、关键参数等)? +- YES:推荐包含关键细节,临床医生可直接执行 +- PARTIAL:推荐方向明确但缺少关键细节 +- NO:推荐过于模糊,无法指导临床决策 + +## R7. patient_preference_considered【Minor,权重1】 +患者偏好或价值观是否被纳入推荐表述(或明确说明不适用)? +- YES:患者偏好被纳入,或明确说明不适用 +- PARTIAL:有提及但表述笼统 +- NO:完全未提及患者偏好 # Output Format -仅输出以下JSON格式,不要包含任何其他文本: +仅输出以下 JSON,不要包含任何其他文本: ```json {{ - "grounding_audit": {{ - "recommendation_based_on_evidence": "YES | PARTIAL | NO", - "uses_external_knowledge": "YES | NO" - }}, - "strength_audit": {{ - "insufficient_evidence_appropriate": "YES | NO | NA", - "strength_matches_evidence_quality": "YES | MINOR_MISMATCH | MAJOR_MISMATCH" + "gate_results": {{ + "recommendation_grounded_in_evidence": "YES | NO", + "route_dimension_consistent": "YES | NO", + "strength_not_grossly_inflated": "YES | NO" }}, - "actionability_audit": {{ + "rubric_results": {{ + "effect_size_correctly_reported": "YES | PARTIAL | NO", + "strength_matches_evidence": "YES | PARTIAL | NO", + "population_applicability_addressed": "YES | PARTIAL | NO", + "uncertainty_source_explained": "YES | PARTIAL | NO", + "citation_traceable": "YES | PARTIAL | NO", "recommendation_specific": "YES | PARTIAL | NO", - "caveats_documented": "YES | PARTIAL | NO | NA" 
+ "patient_preference_considered": "YES | PARTIAL | NO" }}, - "reasoning": "一句话说明推荐质量的主要特征及核心问题" + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" }} ``` diff --git a/src/config/prompts/judge/appraise_judge.txt b/src/config/prompts/judge/appraise_judge.txt index 5c20140..f80e48d 100644 --- a/src/config/prompts/judge/appraise_judge.txt +++ b/src/config/prompts/judge/appraise_judge.txt @@ -1,84 +1,94 @@ # Role -你是一个严格的循证医学(EBM)审计员。你的任务是对Appraise Agent的GRADE评价进行客观分类判断,**不要打分**,只需判断每个检查点是否达标。 +你是一个严格的EBM审计员。对 Appraise Agent 的GRADE评价进行客观分类判断,只输出结构化 JSON,不要打分。 # 背景说明 -Appraise Agent输出的是结构化的GRADE分类标签(study_type、risk_of_bias等),**最终GRADE等级由系统代码根据这些标签自动计算**。你的审计重点是: +Appraise Agent 输出结构化的GRADE分类标签(study_type、risk_of_bias等),最终GRADE等级由系统代码根据这些标签自动计算。你的审计重点是: 1. LLM对研究类型(study_type)的识别是否正确 2. 各降级/升级因素的分类是否合理 3. 系统计算出的GRADE等级(computed_grade)是否与你的独立判断一致 -4. 冲突识别是否准确 -# Input Data -## 证据列表 -{evidence_list} - -## Appraise Agent 输出(包含分类标签和计算结果) -{stage_output} - -# Audit Task - -## 1. GRADE因素分类审计 - -**study_type_correct**:Appraise Agent对研究类型(study_type)的识别是否准确? -- `YES`:所有研究的study_type识别正确(RCT/COHORT/CASE_CONTROL/CASE_REPORT) -- `PARTIAL`:大部分正确,个别研究类型有可商榷之处 -- `NO`:存在明显错误(如将观察性研究标记为RCT,或将RCT标记为COHORT) - -**downgrade_factors_appropriate**:各降级因素(risk_of_bias、inconsistency、indirectness、imprecision)的分类是否合理? -- `YES`:各因素的严重程度标签(NOT_SERIOUS/SERIOUS/VERY_SERIOUS)与摘要信息相符 -- `PARTIAL`:整体合理,但个别因素评估过于宽松或严苛 -- `NO`:降级因素评估存在明显错误(如将未盲法RCT标记为NOT_SERIOUS偏倚风险) - -**computed_grade_reasonable**:系统根据分类计算出的最终GRADE等级(computed_grade)是否合理? -- `YES`:计算结果与基于摘要的独立判断一致 -- `PARTIAL`:整体合理,个别研究的等级有轻微偏差 -- `NO`:计算结果明显不合理(通常是因为study_type或降级因素分类错误导致) - -## 2. 证据冲突审计 - -**conflicts_exist**:证据间是否存在实质性冲突? -(结论方向相反:部分研究显示有效/有益,另一些显示无效/有害) -- `YES`:存在实质性、方向性冲突 -- `NO`:各研究结论方向一致(数量差异不算冲突) - -**conflicts_identified**:(仅当 `conflicts_exist = YES` 时判断)冲突是否被正确识别和分析? 
-- `YES`:所有主要冲突均被识别,conflict_description描述准确 -- `PARTIAL`:识别了主要冲突,但有遗漏或描述不够深入 -- `NO`:存在明显冲突但未被识别(conflicts_exist错误标记为NO) -- `NA`:`conflicts_exist = NO`,此项不适用 - -## 3. 数值数据审计 - -**numerical_data_extracted**:是否合理评估了摘要中可用的数值数据? -- `YES`:data_available的判断准确,能识别摘要中存在的数值指标 -- `PARTIAL`:判断基本合理,有轻微偏差 -- `NO`:对数值数据的评估明显错误(如摘要有明确效应量但标记为NO) -- `NA`:研究类型本身不涉及数值结果 - -**confidence_level_appropriate**:数值提取的置信度(confidence_level)评估是否合理? -- `HIGH`:置信度标签与实际数据可靠性高度吻合 -- `MODERATE`:置信度判断基本合理(仅用摘要时MODERATE属正常范围) -- `LOW`:置信度标签明显过高(实际数值不可靠但标记为HIGH/MODERATE) -- `VERY_LOW`:置信度标签严重偏离实际情况 +# Input +证据列表:{evidence_list} +Appraise Agent 输出(包含分类标签和计算结果):{stage_output} + +# 一票否决项(Gate) + +## G1. study_type_correct +所有研究的 study_type 识别是否正确? +- YES:所有研究的 study_type 识别正确 +- NO:存在明显错误(如将观察性研究标记为RCT) + +## G2. computed_grade_reasonable +系统计算出的最终GRADE等级(computed_grade)是否合理? +- YES:计算结果与基于摘要的独立判断一致 +- NO:明显不合理(通常是 study_type 或降级因素错误导致) + +注意:以下情况属于合理结果,不应判断为 NO: +- SR/MA 纳入观察性研究(included_study_type=OBSERVATIONAL)→ 初始分为 Low,即使无降级因素也可能输出 Low/Very Low +- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时,即使 large_effect=YES 也不升级 +- CROSS_SECTIONAL 无升级因素 → 最高只能到 Low + +# Rubric 评分项 + +## R1. downgrade_factors_appropriate【Critical,权重3】 +四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注是否与摘要信息相符? +- YES:各因素的严重程度标签(NOT_SERIOUS/SERIOUS/VERY_SERIOUS)与摘要信息相符 +- PARTIAL:整体合理,但个别因素评估过于宽松或严苛 +- NO:存在明显错误(如未盲法 RCT 标记为 NOT_SERIOUS 偏倚风险) + +## R2. included_study_type_correct【Critical,权重3】 +(仅当证据列表含 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时判断,否则填 NA) +SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类型相符? +- YES:字段与摘要描述的纳入研究类型相符(如摘要明确描述"纳入RCT"→ RCT) +- PARTIAL:摘要信息不足以确认(如摘要未描述纳入类型 → UNKNOWN 是合理选择) +- NO:明显错误(如摘要写"仅纳入RCT"但标注为 OBSERVATIONAL) +- NA:证据列表中没有 SR/MA/NMA 类型研究 + +## R3. upgrade_factors_appropriate【Major,权重2】 +(仅当证据列表含 COHORT/CASE_CONTROL 时判断,否则填 NA) +升级因素(large_effect/dose_response/confounding_bias_mitigates)的标注是否与摘要信息相符? 
+- YES:升级因素的 YES/NO 标注与摘要信息相符 +- PARTIAL:整体合理,个别因素有轻微偏差 +- NO:明显错误(如无明确剂量效应数据但标注 dose_response=YES) +- NA:证据列表中没有 COHORT/CASE_CONTROL 研究 + +## R4. upgrade_blocked_appropriate【Major,权重2】 +(仅当含 COHORT/CASE_CONTROL 且 risk_of_bias=SERIOUS/VERY_SERIOUS 时判断,否则填 NA) +存在严重偏倚风险时,升级因素是否被正确阻断(upgrade_blocked_by_bias=True)? +- YES:risk_of_bias=SERIOUS/VERY_SERIOUS 时,upgrade_blocked_by_bias 正确标注为 True,且最终等级未因升级因素提升 +- NO:存在严重偏倚但升级因素仍被计入 +- NA:无 COHORT/CASE_CONTROL 研究,或 risk_of_bias 均为 NOT_SERIOUS + +## R5. conflicts_identified【Major,权重2】 +证据间存在实质性冲突时,冲突是否被正确识别并描述? +- YES:所有主要冲突均被识别,conflict_description 描述准确;或证据间无冲突(正确标记为无冲突) +- PARTIAL:识别了主要冲突,但有遗漏或描述不够深入 +- NO:存在明显冲突但完全未识别 + +## R6. numerical_data_extracted【Minor,权重1】 +摘要中存在效应量/CI/P值时,是否均被提取? +- YES:data_available 的判断准确,能识别摘要中存在的数值指标 +- PARTIAL:判断基本合理,有轻微偏差 +- NO:摘要有明确效应量但标记为未提取 # Output Format -仅输出以下JSON格式,不要包含任何其他文本: +仅输出以下 JSON,不要包含任何其他文本: ```json {{ - "grade_audit": {{ - "study_type_correct": "YES | PARTIAL | NO", - "downgrade_factors_appropriate": "YES | PARTIAL | NO", - "computed_grade_reasonable": "YES | PARTIAL | NO" - }}, - "conflict_audit": {{ - "conflicts_exist": "YES | NO", - "conflicts_identified": "YES | PARTIAL | NO | NA" + "gate_results": {{ + "study_type_correct": "YES | NO", + "computed_grade_reasonable": "YES | NO" }}, - "data_audit": {{ - "numerical_data_extracted": "YES | PARTIAL | NO | NA", - "confidence_level_appropriate": "HIGH | MODERATE | LOW | VERY_LOW" + "rubric_results": {{ + "downgrade_factors_appropriate": "YES | PARTIAL | NO", + "included_study_type_correct": "YES | PARTIAL | NO | NA", + "upgrade_factors_appropriate": "YES | PARTIAL | NO | NA", + "upgrade_blocked_appropriate": "YES | NO | NA", + "conflicts_identified": "YES | PARTIAL | NO", + "numerical_data_extracted": "YES | PARTIAL | NO" }}, - "reasoning": "一句话说明GRADE评价的主要质量特征及核心问题" + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" }} ``` diff --git a/src/config/prompts/judge/ask_judge.txt 
b/src/config/prompts/judge/ask_judge.txt index 253e010..dfb1510 100644 --- a/src/config/prompts/judge/ask_judge.txt +++ b/src/config/prompts/judge/ask_judge.txt @@ -1,90 +1,71 @@ # Role -你是一个严格的循证医学(EBM)审计员。你的任务是对Ask Agent的输出进行客观的分类判断,**不要打分**,只需判断每个检查点是否达标。 - -# Input Data -## 原始问题 (User Query) -{original_question} - -## Ask Agent 输出 -{stage_output} - -# Audit Task - -## 1. PICO 结构化审计 -判断以下每个PICO要素的提取状态: - -**P (Patient/Population)** -- `YES`:明确提及患者人群特征(年龄、性别、具体疾病状态等) -- `PARTIAL`:提及了但模糊(如仅说"患者"未说具体疾病) -- `NO`:完全缺失 - -**I (Intervention)** -- `YES`:明确提及了核心干预措施或诊断方法 -- `PARTIAL`:提及但描述模糊 -- `NO`:缺失 - -**C (Comparison)** -- `YES`:明确提及对照组 -- `NA`:原问题不涉及对照,无需提及(此为正确,视为通过) -- `NO`:原问题明显暗示需要对照,但未提及 - -**O (Outcome)** -- `YES`:明确提及期望的临床结局 -- `PARTIAL`:提及但模糊 -- `NO`:缺失 - -## 2. 检索策略审计 - -**keywords_english_medical**:`keywords` 数组中的检索词是否全部为英文医学术语? -(注意:此项**仅检查 `keywords` 数组**,不检查 patient/intervention/comparison/outcome 等PICO描述字段——后者允许与输入问题语言一致。PubMed 只支持英文检索,因此关键词必须是英文。) -- `YES`:`keywords` 数组中全部为英文专业术语(含MeSH词) -- `NO`:`keywords` 数组中包含中文或非医学口语化词汇 - -**has_synonyms_or_mesh**:是否包含MeSH词或同义词扩展? -- `YES`:包含同义词或MeSH标准词(如 "heart failure" OR "cardiac failure") -- `NO`:仅有单一关键词,无任何扩展 - -**boolean_logic_valid**:`keywords` 数组中的检索词概念覆盖是否合理,语义上可组合为有效检索策略? -(注意:`keywords` 是概念列表,**不要求数组中出现 AND/OR 字符串**,布尔拼接由后续 Acquire 阶段完成。此项判断的是:概念本身是否互不冗余、覆盖了 PICO 核心要素,且不包含明显无关的词。) -- `YES`:关键词概念合理,覆盖了 P/I/O 核心要素,无明显冗余或无关词 -- `NO`:关键词存在严重问题(如包含大量无关词、仅有1个过于宽泛的词、全部词指向同一个概念) - -## 3. 表述清晰度审计 - -**pico_statement_unambiguous**:PICO表述是否清晰无歧义? -- `YES`:表述明确,任何人读到都能得出相同理解 -- `PARTIAL`:存在轻微歧义,但不影响核心检索方向 -- `NO`:存在严重歧义,难以据此开展检索 - -## 4. 安全性审计 (Critical) - -**intent_distorted**:PICO是否扭曲了原问题的核心意图? -(例:原问题问儿童,PICO却写成人;原问题问治疗效果,PICO却问诊断准确性) -- `YES`:存在严重意图扭曲(此项将直接导致任务失败) -- `NO`:意图基本一致 +你是一个严格的EBM审计员。对 Ask Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 + +# Input +原始问题:{original_question} +路由类型:{route_type} +Ask Agent 输出:{stage_output} + +# 一票否决项(Gate) +以下任一项为 NO 时,整体判定为 gate_fail,无需继续评分。 + +## G1. 
intent_not_distorted +结构化结果是否忠实反映原问题意图(方向性:人群、问题类型)? +- YES:意图一致 +- NO:方向性错误(问儿童→写成人;问治疗→写诊断) + +## G2. route_correct(仅当 route_type != direct_answer 时判断,否则填 NA) +route_type 与问题类型是否匹配? +- YES:匹配 +- NO:明显错误(如诊断准确性问题路由为 ebm_pico) +- NA:route_type = direct_answer,不适用 + +## G3. nonresearch_classification_correct(仅当 route_type = direct_answer 时判断,否则填 NA) +以下三条触发条件是否全部满足? +1. 问题要求立即操作性指导(动词:如何处理/立即给/紧急处置) +2. 延迟回答会直接危及患者生命安全 +3. 答案来自已有公认标准流程(BLS/ACLS/指南操作章节) +- YES:三条均满足 +- NO:任一条不满足(应重路由到 EBM 流程) +- NA:route_type != direct_answer,不适用 + +# Rubric 评分项(仅适用于 EBM 路由;direct_answer 路由时所有 rubric 填 NA) + +## R1. core_dimensions_present【Critical,权重3】 +P + 主焦点维度(ebm_pico→I;ebm_pird→IndexTest;ebm_peo→Exposure;ebm_prognosis→PF)+ O 是否均有实质内容? +- YES:三个核心维度均有实质内容 +- PARTIAL:三者中有一个描述极度模糊(如 O="outcomes")但方向正确 +- NO:任一核心维度完全缺失或填写错误 + +## R2. secondary_dimensions_present【Major,权重2】 +次要维度(ebm_pico→C;ebm_pird→R;ebm_prognosis→TH;ebm_peo 无次要维度填 NA)是否按路由要求填写?原问题未涉及的填 NA。 +- YES:次要维度填写正确,或原问题未涉及时正确填 NA +- PARTIAL:次要维度有轻微偏差但不影响检索方向 +- NO:次要维度明显错误(如 PIRD 的 R 字段填了干预措施) +- NA:ebm_peo 路由(无次要维度) + +## R3. statement_unambiguous【Minor,权重1】 +结构化表述是否无歧义,可直接用于检索? 
+- YES:表述明确,无歧义 +- PARTIAL:有轻微歧义但不影响检索方向 +- NO:严重歧义,检索方向不确定 # Output Format -仅输出以下JSON格式,不要包含任何其他文本: +仅输出以下 JSON,不要包含任何其他文本: ```json {{ - "pico_audit": {{ - "P": "YES | PARTIAL | NO", - "I": "YES | PARTIAL | NO", - "C": "YES | NA | NO", - "O": "YES | PARTIAL | NO" - }}, - "search_audit": {{ - "keywords_english_medical": "YES | NO", - "has_synonyms_or_mesh": "YES | NO", - "boolean_logic_valid": "YES | NO" - }}, - "clarity_audit": {{ - "pico_statement_unambiguous": "YES | PARTIAL | NO" + "gate_results": {{ + "intent_not_distorted": "YES | NO", + "route_correct": "YES | NO | NA", + "nonresearch_classification_correct": "YES | NO | NA" }}, - "safety_audit": {{ - "intent_distorted": "YES | NO" + "rubric_results": {{ + "core_dimensions_present": "YES | PARTIAL | NO | NA", + "secondary_dimensions_present": "YES | PARTIAL | NO | NA", + "statement_unambiguous": "YES | PARTIAL | NO | NA" }}, - "reasoning": "一句话说明主要的PARTIAL或NO判断依据,若全部通过则说明整体质量" + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | gate_fail" }} ``` diff --git a/src/config/prompts/judge/assess_judge.txt b/src/config/prompts/judge/assess_judge.txt index 30e8cd6..38078c4 100644 --- a/src/config/prompts/judge/assess_judge.txt +++ b/src/config/prompts/judge/assess_judge.txt @@ -14,6 +14,11 @@ ## Assess Agent 输出 {stage_output} +## 路由信息 +- 路由类型:{route_type} +- 路由置信度:{route_confidence} +- EBM查询描述:{ebm_query_description} + # Audit Task ## 1. 回答完整性审计 @@ -32,8 +37,8 @@ 逐段检查推理链的连接质量。 **ask_to_acquire_link**:Ask阶段的PICO是否有效指导了Acquire阶段的检索? -- `CLEAR`:检索策略直接来源于PICO,关键词与P/I/O要素对应明确 -- `WEAK`:关联存在但不够紧密,检索词覆盖了PICO的主要方面但有跳跃 +- `CLEAR`:检索策略直接来源于PICO,关键词与P/I/O要素对应明确;且检索词覆盖了 {route_type} 对应的关键维度(ebm_pico→P+I+O;ebm_pird→P+IndexTest+RefStd;ebm_peo→P+Exposure+O;ebm_prognosis→P+PrognosticFactor+O) +- `WEAK`:关联存在但不够紧密,检索词覆盖了PICO的主要方面但有跳跃,或遗漏了 {route_type} 特定维度中的某个关键要素 - `BROKEN`:检索策略与PICO脱节,检索了与PICO无关的主题 **acquire_to_appraise_link**:Acquire阶段获取的证据是否被Appraise阶段正确评价? 
@@ -60,6 +65,11 @@ - `MINOR_ISSUE`:有轻微不一致(如细节措辞差异),不影响核心结论 - `MAJOR_CONTRADICTION`:存在明显矛盾(不同阶段的人群、干预或结论方向不一致) +**route_confidence_noted**:若 route_confidence = low,最终输出是否包含路由不确定性说明? +- `YES`:route_confidence = low 且输出中包含了对问题框架不确定性的说明 +- `NO`:route_confidence = low 但输出中未提及路由不确定性 +- `NA`:route_confidence != low,不适用 + # Output Format 仅输出以下JSON格式,不要包含任何其他文本: @@ -76,7 +86,8 @@ }}, "consistency_audit": {{ "grade_to_strength_consistent": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION", - "no_internal_contradictions": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION" + "no_internal_contradictions": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION", + "route_confidence_noted": "YES | NO | NA" }}, "reasoning": "一句话说明整体推理链质量及主要问题" }} diff --git a/src/coordinator/coordinator.py b/src/coordinator/coordinator.py index 7a61c47..a5bb16a 100644 --- a/src/coordinator/coordinator.py +++ b/src/coordinator/coordinator.py @@ -54,6 +54,14 @@ def initialize_state(self, question: str) -> WorkflowState: remaining_budget=20, soft_gate_signals=[], question_type=None, + # Routing fields (populated by AskAgent) + route_type=None, + route_confidence=None, + direct_answer_output=None, + ebm_query=None, + sub_pico_queries=None, + sub_question_index=None, + sub_question_total=None, ) def execute_agent(self, agent_name: str, state: WorkflowState) -> WorkflowState: @@ -203,6 +211,15 @@ def execute_workflow(self, question: str) -> WorkflowState: # Execute current agent (includes Judge timing inside execute_agent) state = self.execute_agent(current_step, state) + # ── Direct-answer early exit ──────────────────────────────────────── + # If the Ask agent decided the question can be answered directly from + # established knowledge, skip the full pipeline and return immediately. 
+ if current_step == "Ask" and state.get("route_type") == "direct_answer": + print("[ROUTE] direct_answer — skipping full pipeline.") + state["should_terminate"] = True + state["current_step"] = None + break + # Check hard gates first gate_trigger = check_hard_gates(state) if gate_trigger: diff --git a/src/judge/judge_llm.py b/src/judge/judge_llm.py index 795f271..47108d2 100644 --- a/src/judge/judge_llm.py +++ b/src/judge/judge_llm.py @@ -5,38 +5,192 @@ from src.state.schema import Observe, Evaluation, WorkflowState from src.agents.base import robust_parse_json -# Dimension weights per stage (used by Python to compute weighted overall_score) -STAGE_WEIGHTS = { +# Rubric weight definitions per stage. +# Each rubric: (weight, allows_partial) +# Gate items are checked separately in _check_gates() — not listed here. +RUBRIC_WEIGHTS = { "Ask": { - "pico_completeness": 0.35, - "searchability": 0.35, - "clarity": 0.30, + "core_dimensions_present": (3, True), # Critical + "secondary_dimensions_present": (2, True), # Major + "statement_unambiguous": (1, True), # Minor }, "Acquire": { - "evidence_potency": 0.40, - "evidence_hierarchy": 0.30, - "pico_relevance": 0.30, + "keywords_cover_pico_dimensions": (3, True), + "primary_focus_match": (3, True), + "outcome_match": (3, True), + "keywords_have_synonyms": (2, True), + "keywords_count_sufficient": (2, True), + "study_design_matches_route": (2, True), + "population_match": (2, True), + "top_selection_appropriate": (1, True), + "selection_count_appropriate": (1, True), + "key_sentences_present": (1, True), }, "Appraise": { - "grade_reasonableness": 0.40, - "conflict_identification": 0.30, - "numerical_confidence": 0.30, + "downgrade_factors_appropriate": (3, True), + "included_study_type_correct": (3, True), + "upgrade_factors_appropriate": (2, True), + "upgrade_blocked_appropriate": (2, False), # only YES/NO/NA + "conflicts_identified": (2, True), + "numerical_data_extracted": (1, True), }, "Apply": { - "evidence_alignment": 
0.40, - "strength_appropriateness": 0.35, - "actionability": 0.25, - }, - "Assess": { - "answer_completeness": 0.35, - "reasoning_chain": 0.35, - "logical_consistency": 0.30, + "effect_size_correctly_reported": (3, True), + "strength_matches_evidence": (3, True), + "population_applicability_addressed": (2, True), + "uncertainty_source_explained": (2, True), + "citation_traceable": (2, True), + "recommendation_specific": (1, True), + "patient_preference_considered": (1, True), }, } +# Legacy weights kept for Assess stage (unchanged) +_ASSESS_WEIGHTS = { + "answer_completeness": 0.35, + "reasoning_chain": 0.35, + "logical_consistency": 0.30, +} + PASS_THRESHOLD = 0.7 +# --------------------------------------------------------------------------- +# Gate + Rubric helpers (shared across stages) +# --------------------------------------------------------------------------- + + +def _check_gates(stage: str, audit: Dict) -> List[str]: + """ + Check gate items for a stage. Returns list of failed gate names. + Any gate failure means overall fail regardless of rubric scores. 
+ """ + gate_results = audit.get("gate_results", {}) + failed: List[str] = [] + + if stage == "Ask": + if gate_results.get("intent_not_distorted") == "NO": + failed.append("intent_not_distorted") + if gate_results.get("route_correct") == "NO": + failed.append("route_correct") + if gate_results.get("nonresearch_classification_correct") == "NO": + failed.append("nonresearch_classification_correct") + + elif stage == "Acquire": + if gate_results.get("search_terms_valid") == "NO": + failed.append("search_terms_valid") + + elif stage == "Appraise": + if gate_results.get("study_type_correct") == "NO": + failed.append("study_type_correct") + if gate_results.get("computed_grade_reasonable") == "NO": + failed.append("computed_grade_reasonable") + + elif stage == "Apply": + if gate_results.get("recommendation_grounded_in_evidence") == "NO": + failed.append("recommendation_grounded_in_evidence") + if gate_results.get("route_dimension_consistent") == "NO": + failed.append("route_dimension_consistent") + if gate_results.get("strength_not_grossly_inflated") == "NO": + failed.append("strength_not_grossly_inflated") + + return failed + + +def _score_rubrics(stage: str, audit: Dict) -> Tuple[Dict[str, Any], List[Dict], float]: + """ + Score rubric items using the weighted rubric system. + Returns (dimension_scores, raw_issues, overall_score). + NA items are excluded from the denominator. + YES = full weight, PARTIAL = weight * 0.5, NO = 0. 
+ """ + rubric_weights = RUBRIC_WEIGHTS.get(stage, {}) + rubric_results = audit.get("rubric_results", {}) + issues: List[Dict] = [] + total_score = 0.0 + total_max = 0.0 + dimension_scores: Dict[str, Any] = {} + + for rubric_name, (weight, allows_partial) in rubric_weights.items(): + val = rubric_results.get(rubric_name, "NA") + if val == "NA": + dimension_scores[rubric_name] = None # excluded from denominator + continue + + if val == "YES": + score = float(weight) + elif val == "PARTIAL" and allows_partial: + score = weight * 0.5 + else: # NO or PARTIAL on a non-partial rubric + score = 0.0 + + total_score += score + total_max += weight + dimension_scores[rubric_name] = score / weight # normalise to 0-1 for display + + if val == "NO": + severity = "critical" if weight == 3 else "major" if weight == 2 else "minor" + issues.append({ + "severity": severity, + "dimension": rubric_name, + "description": f"{rubric_name} 未通过(NO)", + }) + elif val == "PARTIAL": + severity = "major" if weight >= 2 else "minor" + issues.append({ + "severity": severity, + "dimension": rubric_name, + "description": f"{rubric_name} 部分通过(PARTIAL)", + }) + + overall = total_score / total_max if total_max > 0 else 1.0 + return dimension_scores, issues, overall + + +def _appraise_layer1_check(output: Dict) -> Dict: + """ + Layer 1 Python hardcoded validation for Appraise stage. + Returns dict with keys: passed (bool), failures (list[str]). + If passed=True, skip LLM Judge entirely. + Raises SystemError if grade_output_in_legal_range fails. 
+ """ + LEGAL_GRADES = {"High", "Moderate", "Low", "Very Low"} + LEGAL_STUDY_TYPES = { + "RCT", "COHORT", "CASE_CONTROL", "CASE_REPORT", + "SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA", + "GUIDELINE", "CROSS_SECTIONAL", "NARRATIVE_REVIEW", "EXPERT_OPINION", + } + failures: List[str] = [] + + appraisal = output.get("appraisal_results") + if appraisal is None: + failures.append("appraisal_results missing") + return {"passed": False, "failures": failures} + + appraisal_d = asdict(appraisal) if is_dataclass(appraisal) else appraisal + evidence_list = appraisal_d.get("evidence", []) if isinstance(appraisal_d, dict) else [] + + for ev in evidence_list: + study_type = ev.get("study_type") + if not study_type or study_type not in LEGAL_STUDY_TYPES: + failures.append( + f"study_type missing or illegal: pmid={ev.get('pmid', '?')} study_type={study_type}" + ) + + rob = ev.get("risk_of_bias") + if rob is None: + failures.append(f"risk_of_bias missing: pmid={ev.get('pmid', '?')}") + + grade = ev.get("grade_level") + if grade and grade not in LEGAL_GRADES: + raise SystemError( + f"grade_output_in_legal_range FAILED: pmid={ev.get('pmid', '?')} grade={grade}. " + "Illegal grade value — workflow terminated." 
+ ) + + return {"passed": len(failures) == 0, "failures": failures} + + # --------------------------------------------------------------------------- # Per-stage Python scoring functions # Each function takes the LLM audit dict and returns: @@ -44,8 +198,29 @@ # --------------------------------------------------------------------------- -def _score_ask(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str]: - """Convert Ask audit classifications to dimension scores and issues.""" +def _score_ask(audit: Dict) -> Tuple[Dict[str, Any], List[Dict], bool, str]: + """Gate + Rubric scoring for Ask stage.""" + gate_failures = _check_gates("Ask", audit) + if gate_failures: + issues = [ + {"severity": "critical", "dimension": g, "description": f"Gate 失败: {g}"} + for g in gate_failures + ] + return {"core_dimensions_present": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" + + # direct_answer: gate passed means classification correct → terminate signal + gate_results = audit.get("gate_results", {}) + if gate_results.get("nonresearch_classification_correct") == "YES": + return {"nonresearch": 1.0}, [], False, "direct_answer路由正确,触发terminate" + + dim_scores, issues, overall = _score_rubrics("Ask", audit) + failures = audit.get("failures", []) + hint = "; ".join(failures) if failures else f"综合评分: {overall:.2f}" + return dim_scores, issues, False, hint + + +def _score_ask_legacy(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str]: + """Legacy Ask scoring — kept for backward compat with old prompt format.""" issues: List[Dict] = [] # --- Safety circuit breaker --- diff --git a/src/state/schema.py b/src/state/schema.py index 6094f27..84a62e4 100644 --- a/src/state/schema.py +++ b/src/state/schema.py @@ -14,6 +14,20 @@ class PICOQuery: keywords: List[str] +@dataclass +class EBMQuery: + """Structured clinical question supporting multiple EBM query frameworks""" + + query_type: str # "pico" | "pird" | "peo" | "prognosis" | "diagnostic_reasoning" + patient: 
str + primary_focus: str # intervention / index_test / exposure / prognostic_factor + outcome: str + keywords: List[str] + comparator: Optional[str] = None # comparison / reference_standard (PICO/PIRD) + reference_standard: Optional[str] = None # gold standard for diagnostic questions + time_horizon: Optional[str] = None # relevant for prognosis questions + + @dataclass class Evidence: """Single piece of evidence""" @@ -29,6 +43,7 @@ class Evidence: pmcid: Optional[str] = None # PMC article ID (local DB only) full_text: Optional[str] = None # Full text (local DB only, not passed to prompts) key_sentences: Optional[str] = None # Extracted span(s) relevant to query keywords + has_full_text: bool = False # True when full_text field is populated @dataclass @@ -163,3 +178,10 @@ class WorkflowState(TypedDict): remaining_budget: int soft_gate_signals: List[str] question_type: Optional[str] + route_type: Optional[str] # "direct_answer" | "full_pipeline" | "sub_questions" + route_confidence: Optional[float] # 0.0-1.0 confidence in routing decision + direct_answer_output: Optional[Dict[str, Any]] # populated when route_type == "direct_answer" + ebm_query: Optional[EBMQuery] # structured query replacing/extending pico_query + sub_pico_queries: Optional[List[EBMQuery]] # decomposed sub-questions + sub_question_index: Optional[int] # current sub-question being processed (0-based) + sub_question_total: Optional[int] # total number of sub-questions diff --git a/src/tools/pubmed_api.py b/src/tools/pubmed_api.py index 2172d54..99a47c9 100644 --- a/src/tools/pubmed_api.py +++ b/src/tools/pubmed_api.py @@ -126,6 +126,85 @@ def fetch_summaries(self, pmids: List[str]) -> dict: response.raise_for_status() return response.json() + def fetch_pmc_ids(self, pmids: List[str]) -> dict: + """Convert PubMed IDs to PMC IDs via elink. + + Returns a dict mapping pmid -> "PMC" for articles that have a PMC + record. PMIDs with no PMC entry are omitted. Failures return {}. 
+ """ + if not pmids: + return {} + + import xml.etree.ElementTree as ET + + url = f"{self.base_url}/elink.fcgi" + params = { + "dbfrom": "pubmed", + "db": "pmc", + "id": ",".join(pmids), + "retmode": "xml", + "email": self.email, + } + try: + response = requests.get(url, params=params, timeout=15) + response.raise_for_status() + root = ET.fromstring(response.content) + except Exception: + return {} + + pmid_to_pmcid: dict = {} + for link_set in root.findall(".//LinkSet"): + pmid_elem = link_set.find(".//IdList/Id") + if pmid_elem is None: + continue + pmid = pmid_elem.text + for link_set_db in link_set.findall(".//LinkSetDb"): + if link_set_db.findtext("DbTo", "") != "pmc": + continue + pmc_id_elem = link_set_db.find(".//Link/Id") + if pmc_id_elem is not None: + pmid_to_pmcid[pmid] = f"PMC{pmc_id_elem.text}" + break # take the first PMC link only + return pmid_to_pmcid + + def fetch_pmc_full_text(self, pmcid: str) -> Optional[str]: + """Fetch full article text from PubMed Central. + + Args: + pmcid: PMC ID string, e.g. "PMC1234567" or bare "1234567". + + Returns: + Extracted plain-text body joined by double newlines, or None if + the article is not available in PMC open-access XML. + """ + import xml.etree.ElementTree as ET + + # efetch wants the numeric ID only — strip any "PMC" prefix + numeric_id = pmcid.lstrip("PMCpmc") + + url = f"{self.base_url}/efetch.fcgi" + params = { + "db": "pmc", + "id": numeric_id, + "retmode": "xml", + "email": self.email, + } + try: + response = requests.get(url, params=params, timeout=30) + response.raise_for_status() + root = ET.fromstring(response.content) + except Exception: + return None + + # PMC XML: →

<p>; collect all <p>

text nodes + paragraphs: List[str] = [] + for elem in root.iter("p"): + text = "".join(elem.itertext()).strip() + if text: + paragraphs.append(text) + + return "\n\n".join(paragraphs) if paragraphs else None + def search_pubmed( query: str, max_results: int = 5, email: str = None @@ -150,12 +229,14 @@ def search_pubmed( if not pmids: return [] - # Fetch summaries and abstracts in parallel — both only need the PMIDs list - with ThreadPoolExecutor(max_workers=2) as executor: + # Fetch summaries, abstracts, and PMC ID mapping in parallel + with ThreadPoolExecutor(max_workers=3) as executor: fut_summaries = executor.submit(client.fetch_summaries, pmids) fut_abstracts = executor.submit(client.fetch_abstracts, pmids) + fut_pmc_ids = executor.submit(client.fetch_pmc_ids, pmids) summaries = fut_summaries.result() abstracts = fut_abstracts.result() + pmc_ids = fut_pmc_ids.result() # {pmid: "PMC"} for open-access articles evidence_list = [] @@ -169,6 +250,7 @@ def search_pubmed( pub_date = article.get("epubdate", "") abstract = abstracts.get(pmid, "") + pmcid = pmc_ids.get(pmid) # None if not in PMC open-access evidence = Evidence( title=article.get("title", "No title"), @@ -179,8 +261,24 @@ def search_pubmed( study_type=None, publication_date=pub_date, grade_level=None, + pmcid=pmcid, + has_full_text=pmcid is not None, ) evidence_list.append(evidence) _save_cache(key, evidence_list) return evidence_list + + +def fetch_pmc_full_text(pmid: str, email: str = None) -> Optional[str]: + """Convenience wrapper: fetch PMC full text for a single PubMed article. + + Looks up the PMC ID for *pmid* first, then fetches the full article body. + Returns None if the article has no PMC open-access record or on any error. 
+ """ + client = PubMedClient(email=email) + pmc_ids = client.fetch_pmc_ids([pmid]) + pmcid = pmc_ids.get(pmid) + if not pmcid: + return None + return client.fetch_pmc_full_text(pmcid) diff --git a/tests/test_appraise_grade.py b/tests/test_appraise_grade.py new file mode 100644 index 0000000..034077a --- /dev/null +++ b/tests/test_appraise_grade.py @@ -0,0 +1,92 @@ +""" +Tests for _compute_grade in appraise_agent.py. + +Task 7 spec: + SR+RCT → High + SR+OBSERVATIONAL → Low + COHORT+SERIOUS+all upgrades → Very Low (upgrade blocked by SERIOUS bias) + COHORT+NOT_SERIOUS+all upgrades → Moderate (cap at min(points, 3)) + CROSS_SECTIONAL+all upgrades → Low (not in _UPGRADE_STUDY_TYPES) +""" +import pytest +from src.agents.appraise_agent import _compute_grade + + +def test_sr_rct_high(): + """SR containing RCTs starts at 4 (High) with no downgrades → High.""" + appraisal = { + "study_type": "SYSTEMATIC_REVIEW", + "included_study_type": "RCT", + "risk_of_bias": "NOT_SERIOUS", + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "NA", + "dose_response": "NA", + } + assert _compute_grade(appraisal) == "High" + + +def test_sr_observational_low(): + """SR containing observational studies starts at 2 (Low) with no downgrades → Low.""" + appraisal = { + "study_type": "SYSTEMATIC_REVIEW", + "included_study_type": "OBSERVATIONAL", + "risk_of_bias": "NOT_SERIOUS", + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "NA", + "dose_response": "NA", + } + assert _compute_grade(appraisal) == "Low" + + +def test_cohort_serious_bias_upgrade_blocked(): + """COHORT with SERIOUS risk_of_bias: upgrade factors must be blocked → Very Low.""" + appraisal = { + "study_type": "COHORT", + "included_study_type": "NA", + "risk_of_bias": "SERIOUS", # -1 → points = 2-1 = 1 + "inconsistency": 
"NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "YES", # should be blocked + "dose_response": "YES", # should be blocked + } + assert _compute_grade(appraisal) == "Very Low" + + +def test_cohort_not_serious_all_upgrades_capped_moderate(): + """COHORT with NOT_SERIOUS bias + both upgrades: cap at min(points, 3) → Moderate.""" + appraisal = { + "study_type": "COHORT", + "included_study_type": "NA", + "risk_of_bias": "NOT_SERIOUS", # 0 penalty + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "YES", # +1 → 3 + "dose_response": "YES", # +1 → 4, but capped at 3 + } + assert _compute_grade(appraisal) == "Moderate" + + +def test_cross_sectional_upgrades_not_applied(): + """CROSS_SECTIONAL is not in _UPGRADE_STUDY_TYPES → upgrades ignored → Low.""" + appraisal = { + "study_type": "CROSS_SECTIONAL", + "included_study_type": "NA", + "risk_of_bias": "NOT_SERIOUS", + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "YES", # should be ignored + "dose_response": "YES", # should be ignored + } + assert _compute_grade(appraisal) == "Low" diff --git a/tests/test_integration_routing.py b/tests/test_integration_routing.py new file mode 100644 index 0000000..6d01d58 --- /dev/null +++ b/tests/test_integration_routing.py @@ -0,0 +1,206 @@ +""" +Integration tests for AskAgent routing logic using mock LLM. + +Tests: + 1. direct_answer route → should_terminate=True, direct_answer_output non-empty + 2. ebm_pico route → ebm_query.query_type == "pico", pico_query compat fields present + 3. ebm_pird route → ebm_query.query_type == "pird" + 4. 
Legacy pico_query compat → pico_query fields accessible after full_pipeline +""" + +import json +import pytest +from unittest.mock import MagicMock +from src.agents.ask_agent import AskAgent +from src.state.schema import WorkflowState + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_llm(*responses: str) -> MagicMock: + """Return a mock LLM that yields responses in order.""" + llm = MagicMock() + side_effects = [MagicMock(content=r) for r in responses] + llm.invoke.side_effect = side_effects + return llm + + +def _base_state(question: str) -> WorkflowState: + return WorkflowState( + original_question=question, + current_step="Ask", + iteration_count=0, + agent_call_counts={}, + pico_query=None, + evidence_list=None, + appraisal_results=None, + recommendation=None, + assessment=None, + gate_triggered=None, + backtrack_reason=None, + should_terminate=False, + execution_history=[], + observe_history=[], + decision_history=[], + backtrack_history=[], + human_intervention_requests=[], + remaining_budget=20, + soft_gate_signals=[], + question_type=None, + route_type=None, + route_confidence=None, + direct_answer_output=None, + ebm_query=None, + sub_pico_queries=None, + sub_question_index=None, + sub_question_total=None, + ) + + +# --------------------------------------------------------------------------- +# 1. 
direct_answer route +# --------------------------------------------------------------------------- + +def test_direct_answer_route_sets_terminate(): + """direct_answer route → should_terminate=True, direct_answer_output non-empty.""" + router_json = json.dumps({ + "route_type": "direct_answer", + "route_confidence": 0.95, + "question_type": "Therapy", + "ebm_framework": "pico", + "routing_rationale": "Immediate life-threatening situation", + }) + direct_answer_json = json.dumps({ + "answer": "Call 911 immediately and start CPR.", + "requires_pipeline": False, + }) + + llm = _make_llm(router_json, direct_answer_json) + agent = AskAgent(llm=llm) + state = _base_state("Patient is in cardiac arrest, what do I do?") + + result = agent.execute(state) + + assert result["route_type"] == "direct_answer" + assert result["should_terminate"] is True + assert result["direct_answer_output"] is not None + assert result["direct_answer_output"].get("answer") + + +# --------------------------------------------------------------------------- +# 2. 
ebm_pico route → ebm_query.query_type == "pico" +# --------------------------------------------------------------------------- + +def test_full_pipeline_pico_route(): + """full_pipeline with ebm_pico framework → ebm_query.query_type == 'pico'.""" + router_json = json.dumps({ + "route_type": "full_pipeline", + "route_confidence": 0.9, + "question_type": "Therapy", + "ebm_framework": "pico", + "routing_rationale": "Standard therapy question", + }) + pico_json = json.dumps({ + "query_type": "pico", + "patient": "Adults with type 2 diabetes", + "primary_focus": "SGLT2 inhibitors", + "outcome": "HbA1c reduction", + "keywords": ["SGLT2", "diabetes", "HbA1c"], + "comparator": "placebo", + }) + + llm = _make_llm(router_json, pico_json) + agent = AskAgent(llm=llm) + state = _base_state("Do SGLT2 inhibitors reduce HbA1c in type 2 diabetes?") + + result = agent.execute(state) + + assert result["route_type"] == "full_pipeline" + assert result["should_terminate"] is False + assert result["ebm_query"] is not None + assert result["ebm_query"].query_type == "pico" + # Legacy compat: pico_query must be present with required fields + assert result["pico_query"] is not None + assert result["pico_query"].patient == "Adults with type 2 diabetes" + assert result["pico_query"].intervention == "SGLT2 inhibitors" + + +# --------------------------------------------------------------------------- +# 3. ebm_pird route → ebm_query.query_type == "pird" +# --------------------------------------------------------------------------- + +def test_full_pipeline_pird_route(): + """full_pipeline with ebm_pird framework → ebm_query.query_type == 'pird'. + + Diagnosis questions run diag_step1 before the PIRD prompt, so we need + three LLM responses: router → diag_step1 → pird. 
+ """ + router_json = json.dumps({ + "route_type": "full_pipeline", + "route_confidence": 0.85, + "question_type": "Diagnosis", + "ebm_framework": "pird", + "routing_rationale": "Diagnostic accuracy question", + }) + diag_step1_json = json.dumps({ + "diagnostic_type": "accuracy", + "index_test": "CT pulmonary angiography", + "reference_standard": "V/Q scan", + }) + pird_json = json.dumps({ + "query_type": "pird", + "patient": "Adults with suspected PE", + "primary_focus": "CT pulmonary angiography", + "outcome": "PE diagnosis confirmed", + "keywords": ["CTPA", "pulmonary embolism", "diagnosis"], + "reference_standard": "V/Q scan", + }) + + llm = _make_llm(router_json, diag_step1_json, pird_json) + agent = AskAgent(llm=llm) + state = _base_state("How accurate is CTPA for diagnosing pulmonary embolism?") + + result = agent.execute(state) + + assert result["route_type"] == "full_pipeline" + assert result["ebm_query"] is not None + assert result["ebm_query"].query_type == "pird" + assert result["ebm_query"].reference_standard == "V/Q scan" + + +# --------------------------------------------------------------------------- +# 4. 
Legacy pico_query compat — pico_query fields accessible after full_pipeline +# --------------------------------------------------------------------------- + +def test_pico_query_compat_fields_present(): + """After full_pipeline, pico_query has all legacy fields (patient, intervention, comparison, outcome, keywords).""" + router_json = json.dumps({ + "route_type": "full_pipeline", + "route_confidence": 0.88, + "question_type": "Therapy", + "ebm_framework": "pico", + }) + pico_json = json.dumps({ + "query_type": "pico", + "patient": "Children with asthma", + "primary_focus": "Inhaled corticosteroids", + "outcome": "Exacerbation rate", + "keywords": ["ICS", "asthma", "children"], + "comparator": "LABA", + }) + + llm = _make_llm(router_json, pico_json) + agent = AskAgent(llm=llm) + state = _base_state("Are inhaled corticosteroids effective in children with asthma?") + + result = agent.execute(state) + + pq = result["pico_query"] + assert pq is not None + assert pq.patient == "Children with asthma" + assert pq.intervention == "Inhaled corticosteroids" + assert pq.comparison == "LABA" + assert pq.outcome == "Exacerbation rate" + assert "ICS" in pq.keywords diff --git a/tests/test_judge_rubrics.py b/tests/test_judge_rubrics.py new file mode 100644 index 0000000..af7c146 --- /dev/null +++ b/tests/test_judge_rubrics.py @@ -0,0 +1,141 @@ +""" +Tests for judge_llm.py Gate + Rubrics scoring system. + +Covers: + 1. Gate failure → _score_ask returns score 0.0 and critical issue + 2. Gate pass + all YES rubrics → score 1.0 + 3. _check_gates("Ask", intent_distorted) → returns failure + 4. _check_gates("Apply", recommendation_not_grounded) → returns failure + 5. All YES rubric_results → _score_rubrics returns overall 1.0 +""" + +import pytest +from src.judge.judge_llm import _check_gates, _score_rubrics, _score_ask, RUBRIC_WEIGHTS + + +# --------------------------------------------------------------------------- +# 1. 
Gate failure → _score_ask returns 0.0 with critical issue +# --------------------------------------------------------------------------- + +def test_score_ask_gate_failure_returns_zero(): + """When intent_not_distorted gate fails, _score_ask returns 0.0 score.""" + audit = { + "gate_results": { + "intent_not_distorted": "NO", # gate failure + "route_correct": "YES", + "nonresearch_classification_correct": "NA", + }, + "rubric_results": { + "core_dimensions_present": "YES", + "secondary_dimensions_present": "YES", + "statement_unambiguous": "YES", + }, + "failures": ["intent_not_distorted"], + "overall_quality": "gate_fail", + } + dim_scores, issues, search_exhausted, hint = _score_ask(audit) + + # At least one critical issue must be present + assert any(i["severity"] == "critical" for i in issues), "Expected critical issue on gate failure" + # The dimension score for the failed gate should be 0.0 + assert list(dim_scores.values())[0] == 0.0, "Expected 0.0 score on gate failure" + + +# --------------------------------------------------------------------------- +# 2. 
Gate pass + all YES rubrics → score 1.0 +# --------------------------------------------------------------------------- + +def test_score_ask_all_yes_returns_one(): + """When all gates pass and all rubrics are YES, _score_ask returns overall 1.0.""" + audit = { + "gate_results": { + "intent_not_distorted": "YES", + "route_correct": "YES", + "nonresearch_classification_correct": "NA", + }, + "rubric_results": { + "core_dimensions_present": "YES", + "secondary_dimensions_present": "YES", + "statement_unambiguous": "YES", + }, + "failures": [], + "overall_quality": "pass", + } + dim_scores, issues, search_exhausted, hint = _score_ask(audit) + + assert issues == [], f"Expected no issues, got: {issues}" + # All dimension scores should be 1.0 + for k, v in dim_scores.items(): + if v is not None: + assert v == 1.0, f"Expected 1.0 for {k}, got {v}" + + +# --------------------------------------------------------------------------- +# 3. _check_gates("Ask", intent_distorted=YES) → returns failure list +# --------------------------------------------------------------------------- + +def test_check_gates_ask_intent_distorted(): + """_check_gates returns 'intent_not_distorted' when that gate is NO.""" + audit = { + "gate_results": { + "intent_not_distorted": "NO", + "route_correct": "YES", + "nonresearch_classification_correct": "NA", + } + } + failures = _check_gates("Ask", audit) + assert "intent_not_distorted" in failures + + +# --------------------------------------------------------------------------- +# 4. 
_check_gates("Apply", recommendation_not_grounded) → returns failure +# --------------------------------------------------------------------------- + +def test_check_gates_apply_not_grounded(): + """_check_gates returns 'recommendation_grounded_in_evidence' when that gate is NO.""" + audit = { + "gate_results": { + "recommendation_grounded_in_evidence": "NO", + "route_dimension_consistent": "YES", + "strength_not_grossly_inflated": "YES", + } + } + failures = _check_gates("Apply", audit) + assert "recommendation_grounded_in_evidence" in failures + + +# --------------------------------------------------------------------------- +# 5. All YES rubric_results → _score_rubrics returns overall 1.0 +# --------------------------------------------------------------------------- + +def test_score_rubrics_all_yes_returns_one(): + """_score_rubrics returns overall score 1.0 when all rubrics are YES.""" + # Build an audit with all Ask rubrics set to YES + rubric_results = {k: "YES" for k in RUBRIC_WEIGHTS["Ask"]} + audit = {"rubric_results": rubric_results} + + dim_scores, issues, overall = _score_rubrics("Ask", audit) + + assert overall == pytest.approx(1.0), f"Expected 1.0, got {overall}" + assert issues == [], f"Expected no issues, got: {issues}" + + +# --------------------------------------------------------------------------- +# Bonus: PARTIAL rubric gives 0.5 weight +# --------------------------------------------------------------------------- + +def test_score_rubrics_partial_gives_half(): + """A PARTIAL rubric result contributes 0.5 × weight to the score.""" + # Only one rubric, set to PARTIAL + audit = { + "rubric_results": { + "core_dimensions_present": "PARTIAL", # weight=3, allows_partial=True + "secondary_dimensions_present": "NA", + "statement_unambiguous": "NA", + } + } + dim_scores, issues, overall = _score_rubrics("Ask", audit) + + # score = 3*0.5 / 3 = 0.5 + assert overall == pytest.approx(0.5), f"Expected 0.5, got {overall}" + assert 
dim_scores["core_dimensions_present"] == pytest.approx(0.5) From 52e7e596be4b9a5b19df689be2dabe916bc3997b Mon Sep 17 00:00:00 2001 From: Winda0001 <13912795021@163.com> Date: Wed, 20 May 2026 22:38:14 +0800 Subject: [PATCH 3/4] chore: checkpoint current running system before hypertension RAG refactor Snapshot of the current operating mode prior to refactoring evidence acquisition to the hypertensiondb RAG service. - Modified agents, prompts, coordinator, judge: all reflect the 4/20-4/22 redesign output that has been the working system since. - Deleted obsolete plans/specs from 2026-03~04. - Added router_unified.txt (V2 unified Ask router) and tests/agents/ package which were never committed but are functionally part of the running system. - hypertension/ added to .gitignore (separate sibling repo, not a submodule). Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .gitignore | 4 + .../plans/2026-03-16-opensource-quality.md | 660 -------- .../plans/2026-03-25-web-ui-improvements.md | 1192 -------------- .../2026-04-03-repo-usability-improvements.md | 1405 ----------------- .../plans/2026-04-20-22-full-redesign.md | 321 ---- ...2026-04-22-judge-rubrics-implementation.md | 1276 --------------- .../2026-03-16-opensource-quality-design.md | 141 -- ...026-03-20-obstetrics-evidence-db-design.md | 169 -- .../2026-03-25-web-ui-improvements-design.md | 239 --- ...4-03-repo-usability-improvements-design.md | 338 ---- run_ab_test.sh | 0 run_test.sh | 0 scripts/check_env.py | 0 src/agents/acquire_agent.py | 58 +- src/agents/apply_agent.py | 56 +- src/agents/appraise_agent.py | 35 +- src/agents/ask_agent.py | 68 +- src/agents/assess_agent.py | 25 +- src/agents/base.py | 29 + src/config/llm_config.py | 268 +++- src/config/prompts/acquire_agent.txt | 28 +- src/config/prompts/apply_agent.txt | 21 +- src/config/prompts/appraise_agent.txt | 64 +- src/config/prompts/ask/router_unified.txt | 142 ++ src/config/prompts/judge/acquire_judge.txt | 30 +- 
src/config/prompts/judge/apply_judge.txt | 20 +- src/config/prompts/judge/appraise_judge.txt | 76 +- src/config/prompts/judge/ask_judge.txt | 14 +- src/config/prompts/judge/assess_judge.txt | 35 +- src/config/prompts/scheduling_llm.txt | 74 +- src/coordinator/coordinator.py | 236 ++- src/coordinator/gate_engine.py | 41 - src/judge/judge_llm.py | 742 +++------ src/main.py | 95 +- src/scheduling/scheduling_llm.py | 2 +- src/state/schema.py | 1 + src/tools/pubmed_api.py | 14 + tests/__init__.py | 0 tests/agents/__init__.py | 0 tests/agents/test_acquire_agent.py | 48 + tests/agents/test_apply_agent.py | 46 + tests/agents/test_appraise_agent.py | 40 + tests/agents/test_ask_agent.py | 35 + tests/agents/test_assess_agent.py | 38 + tests/agents/test_base.py | 26 + tests/test_appraise_grade.py | 47 +- tests/test_integration_routing.py | 1 - 47 files changed, 1689 insertions(+), 6511 deletions(-) delete mode 100644 docs/superpowers/plans/2026-03-16-opensource-quality.md delete mode 100644 docs/superpowers/plans/2026-03-25-web-ui-improvements.md delete mode 100644 docs/superpowers/plans/2026-04-03-repo-usability-improvements.md delete mode 100644 docs/superpowers/plans/2026-04-20-22-full-redesign.md delete mode 100644 docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md delete mode 100644 docs/superpowers/specs/2026-03-16-opensource-quality-design.md delete mode 100644 docs/superpowers/specs/2026-03-20-obstetrics-evidence-db-design.md delete mode 100644 docs/superpowers/specs/2026-03-25-web-ui-improvements-design.md delete mode 100644 docs/superpowers/specs/2026-04-03-repo-usability-improvements-design.md mode change 100755 => 100644 run_ab_test.sh mode change 100755 => 100644 run_test.sh mode change 100755 => 100644 scripts/check_env.py create mode 100644 src/config/prompts/ask/router_unified.txt create mode 100644 tests/__init__.py create mode 100644 tests/agents/__init__.py create mode 100644 tests/agents/test_acquire_agent.py create mode 100644 
tests/agents/test_apply_agent.py create mode 100644 tests/agents/test_appraise_agent.py create mode 100644 tests/agents/test_ask_agent.py create mode 100644 tests/agents/test_assess_agent.py create mode 100644 tests/agents/test_base.py diff --git a/.gitignore b/.gitignore index f7fdf35..db1b8df 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,7 @@ logs/ gemini文本生图.py nul .worktrees/ + +# hypertension/ is a separate git repository (its own .git/); +# kept alongside ebm5a as a sibling sub-project, not as a submodule. +hypertension/ diff --git a/docs/superpowers/plans/2026-03-16-opensource-quality.md b/docs/superpowers/plans/2026-03-16-opensource-quality.md deleted file mode 100644 index 93e8d7b..0000000 --- a/docs/superpowers/plans/2026-03-16-opensource-quality.md +++ /dev/null @@ -1,660 +0,0 @@ -# Open-Source Quality Improvements — Implementation Plan - -> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add the scaffolding (LICENSE, CI, contributor docs, packaging, doc structure) that a working codebase needs before it can be treated as a serious open-source project. - -**Architecture:** All changes are purely additive — new files created, a few config files edited, internal dev-note files moved to `docs/internal/`. Zero changes to `src/`. The two operations that touch existing files (`.env.example` sanitisation and `.gitignore` edit) are ordered to prevent any credential exposure. - -**Tech Stack:** Python 3.10, pytest, GitHub Actions, setuptools/pyproject.toml, standard Markdown templates. - ---- - -## Chunk 1: P0 — Legal & Security - -### Task 1: Sanitise `.env.example` - -> ⚠️ This task MUST be completed and committed before Task 2. Reversing the order risks staging real credentials. 
- -**Files:** -- Edit: `.env.example` - -- [ ] **Step 1: Overwrite `.env.example` with placeholder-only content** - -Replace the entire file with: - -```dotenv -# Copy this file to .env and fill in your values. -# NEVER commit .env — it is gitignored. - -# Required -LLM_BASE_URL=https://api.openai.com/v1 -LLM_API_KEY=your_api_key_here -LLM_MODEL=gpt-4 -PUBMED_EMAIL=your_email@example.com - -# Optional: use a faster/cheaper model for Judge and Scheduling (~30-40% faster) -# FAST_LLM_MODEL=gpt-3.5-turbo -``` - -- [ ] **Step 2: Verify no real credentials remain** - -```bash -grep -E "(sk-|@gmail|huatuo)" .env.example -``` -Expected: no output (zero matches). - -- [ ] **Step 3: Commit sanitised template (still gitignored at this point)** - -```bash -git add .env.example -git commit -m "security: replace real credentials in .env.example with placeholders" -``` - ---- - -### Task 2: Edit `.gitignore` — unblock template and tighten rules - -**Files:** -- Edit: `.gitignore` - -- [ ] **Step 1: Make the following changes to `.gitignore` in a single edit** - - - **Remove** the line: `.env.example` ← allows the sanitised template to be tracked - - **Remove** the line: `QUICKSTART.md` ← README links to this file; it must be visible to cloners - - **Add** the line: `*.log` ← covers any `.log` files written to the project root (the existing `logs/` entry covers the directory but not root-level log files) - - Leave `nul` as-is — it is already present; do not add a duplicate. - -- [ ] **Step 2: Verify `.env.example` is now tracked** - -`.env.example` was already committed in Task 1 — use `git ls-files` (not `git status`) to confirm it is tracked: - -```bash -git ls-files .env.example -``` -Expected: `.env.example` (file appears in the tracked list). - -- [ ] **Step 3: Verify `QUICKSTART.md` is now tracked** - -```bash -git ls-files QUICKSTART.md -``` -Expected: `QUICKSTART.md` (file appears in the tracked list). 
- -- [ ] **Step 4: Commit** - -Do NOT re-stage `.env.example` — it is unchanged since Task 1's commit. Stage only the files modified in this task: - -```bash -git add .gitignore QUICKSTART.md -git commit -m "chore: unblock .env.example and QUICKSTART.md; add *.log to gitignore" -``` - ---- - -### Task 3: Add `LICENSE` - -**Files:** -- Create: `LICENSE` - -- [ ] **Step 1: Create `LICENSE` with MIT text** - -``` -MIT License - -Copyright (c) 2026 EBM 5A Contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-``` - -- [ ] **Step 2: Verify file exists and starts with "MIT License"** - -```bash -head -1 LICENSE -``` -Expected: `MIT License` - -- [ ] **Step 3: Commit** - -```bash -git add LICENSE -git commit -m "chore: add MIT LICENSE file" -``` - ---- - -## Chunk 2: P1 — Contributor Experience - -### Task 4: Create `requirements-dev.txt` - -**Files:** -- Create: `requirements-dev.txt` - -- [ ] **Step 1: Create the file** - -Pinned versions are copied verbatim from `requirements.txt`, minus `torch` and `transformers` (too heavy for CI): - -``` -langchain==0.1.0 -langchain-openai==0.0.5 -langgraph==0.0.20 -requests==2.31.0 -pytest==7.4.3 -pytest-cov==4.1.0 -pytest-mock==3.12.0 -python-dotenv==1.0.0 -``` - -- [ ] **Step 2: Verify all versions match `requirements.txt`** - -```bash -grep -f <(grep -v "torch\|transformers" requirements.txt | grep "==") requirements-dev.txt -``` -Expected: all lines echo back (every pinned version is present). - -- [ ] **Step 3: Commit** - -```bash -git add requirements-dev.txt -git commit -m "chore: add requirements-dev.txt for CI (excludes heavy torch/transformers)" -``` - ---- - -### Task 5: Create GitHub Actions CI - -**Files:** -- Create: `.github/workflows/ci.yml` - -- [ ] **Step 1: Create `.github/workflows/` directory and `ci.yml`** - -```yaml -name: CI - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - test: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Cache pip - uses: actions/cache@v4 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - - - name: Install dependencies - run: pip install -r requirements-dev.txt - - - name: Run tests - run: pytest --tb=short -``` - -- [ ] **Step 2: Verify YAML is valid** - -```bash -python -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))" 
&& echo "YAML OK" -``` -Expected: `YAML OK` - -- [ ] **Step 3: Commit** - -```bash -git add .github/workflows/ci.yml -git commit -m "ci: add GitHub Actions workflow (pytest on push/PR)" -``` - ---- - -### Task 6: Create Issue Templates - -**Files:** -- Create: `.github/ISSUE_TEMPLATE/bug_report.md` -- Create: `.github/ISSUE_TEMPLATE/feature_request.md` - -- [ ] **Step 1: Create `bug_report.md`** - -```markdown ---- -name: Bug report -about: Something is not working as expected -labels: bug ---- - -## Description -A clear description of what the bug is. - -## Steps to Reproduce -1. Run: `python -m src.main "..."` -2. See error - -## Expected Behaviour -What you expected to happen. - -## Actual Behaviour -What actually happened. Include the full error message and traceback if applicable. - -## Environment -- OS: -- Python version (`python --version`): -- LLM provider / model: -- EBM 5A version / commit: -``` - -- [ ] **Step 2: Create `feature_request.md`** - -```markdown ---- -name: Feature request -about: Suggest an improvement or new capability -labels: enhancement ---- - -## Problem Statement -What problem does this feature solve? Who is affected? - -## Proposed Solution -Describe the feature you'd like. - -## Alternatives Considered -What other approaches did you consider, and why did you rule them out? - -## Additional Context -Any other context, references, or screenshots. -``` - -- [ ] **Step 3: Commit** - -```bash -git add .github/ISSUE_TEMPLATE/ -git commit -m "docs: add GitHub issue templates (bug report, feature request)" -``` - ---- - -### Task 7: Create PR Template - -**Files:** -- Create: `.github/PULL_REQUEST_TEMPLATE.md` - -- [ ] **Step 1: Create the file** - -```markdown -## Summary -_What does this PR do? Why?_ - -## Type of Change -- [ ] Bug fix -- [ ] New feature -- [ ] Documentation update -- [ ] Refactor / performance improvement -- [ ] Other (describe): - -## Testing Done -_Describe how you tested this change. 
If you added or modified tests, list them here._ - -```bash -pytest -``` - -## Checklist -- [ ] I have read `CONTRIBUTING.md` -- [ ] My changes do not modify files under `src/` in a breaking way -- [ ] I have updated the relevant documentation (README, CHANGELOG, docstrings) -- [ ] Tests pass locally (`pytest`) -``` - -- [ ] **Step 2: Commit** - -```bash -git add .github/PULL_REQUEST_TEMPLATE.md -git commit -m "docs: add pull request template" -``` - ---- - -### Task 8: Create `CONTRIBUTING.md` - -**Files:** -- Create: `CONTRIBUTING.md` - -- [ ] **Step 1: Create the file** - -```markdown -# Contributing to EBM 5A - -Thank you for your interest in contributing. This guide covers everything you need to get started. - ---- - -## Prerequisites - -- Python 3.10+ -- A PubMed-registered e-mail address (required by NCBI API policy) -- An OpenAI-compatible API key - ---- - -## Local Setup - -```bash -# 1. Fork and clone -git clone https://github.com/your-fork/ebm5a.git -cd ebm5a - -# 2. Install full dependencies (includes torch/transformers for MedCPT) -pip install -r requirements.txt - -# 3. Configure environment -cp .env.example .env -# Edit .env with your API key, model, and PubMed email - -# 4. Verify setup -python -m src.main "Should aspirin be used for primary prevention in a 60-year-old?" -``` - -For running tests only (no GPU/torch required): - -```bash -pip install -r requirements-dev.txt -``` - ---- - -## Running Tests - -```bash -pytest # all tests -pytest --tb=short # concise failure output -pytest --cov=src --cov-report=html # with coverage report -``` - -There are currently no automated tests — the test suite is a work in progress. If you are contributing a new feature or bug fix, adding a test is strongly encouraged. - ---- - -## Code Style - -- Follow existing patterns in the file you are editing. -- No hard tabs; use 4-space indentation. -- Keep files focused: each agent, tool, and module has a single clear responsibility. 
- ---- - -## Commit Conventions - -Use the conventional commit format: - -``` -type: short description (imperative, ≤72 chars) -``` - -Common types: `feat`, `fix`, `docs`, `chore`, `refactor`, `test`, `ci` - -Examples: -- `feat: add support for Harm question type in Acquire agent` -- `fix: handle empty PubMed result in three-tier fallback` -- `docs: update README installation section` - ---- - -## Pull Request Process - -1. Open an issue first for non-trivial changes — discuss the approach before writing code. -2. Fork the repo, create a feature branch: `git checkout -b feat/your-feature`. -3. Keep PRs focused on a single concern. -4. Ensure `pytest` passes locally before opening the PR. -5. Fill in the PR template. -6. A maintainer will review and merge. - ---- - -## Reporting Bugs - -Use the [bug report template](.github/ISSUE_TEMPLATE/bug_report.md). -Include the full error traceback, your LLM provider and model, and the exact clinical question that triggered the issue. -``` - -- [ ] **Step 2: Commit** - -```bash -git add CONTRIBUTING.md -git commit -m "docs: add CONTRIBUTING.md" -``` - ---- - -## Chunk 3: P2 — Packaging & Doc Hygiene - -### Task 9: Create `pyproject.toml` - -**Files:** -- Create: `pyproject.toml` - -- [ ] **Step 1: Create the file** - -```toml -[build-system] -requires = ["setuptools>=68", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "ebm5a" -version = "0.1.0" -description = "Evidence-Based Medicine Clinical Decision Support System — multi-agent 5A pipeline" -readme = "README.md" -license = { file = "LICENSE" } -requires-python = ">=3.10" -keywords = ["evidence-based medicine", "clinical decision support", "EBM", "PubMed", "GRADE", "LLM", "agents"] -classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Healthcare Industry", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - 
"Topic :: Scientific/Engineering :: Medical Science Apps.", -] -dependencies = [ - "langchain==0.1.0", - "langchain-openai==0.0.5", - "langgraph==0.0.20", - "requests==2.31.0", - "python-dotenv==1.0.0", - "torch>=2.0.0", - "transformers>=4.36.0", -] - -[project.scripts] -ebm5a = "src.main:main" - -[project.urls] -Homepage = "https://github.com/your-org/ebm5a" -"Bug Tracker" = "https://github.com/your-org/ebm5a/issues" - -[tool.pytest.ini_options] -testpaths = ["tests"] -python_files = ["test_*.py"] -``` - -- [ ] **Step 2: Verify TOML is valid** - -```bash -python -c "import tomllib; tomllib.load(open('pyproject.toml','rb'))" && echo "TOML OK" -``` -Expected: `TOML OK` - -- [ ] **Step 3: Verify editable install works** - -```bash -pip install -e . --no-deps --quiet && python -c "from src.main import main; print('import OK')" -``` -Expected: `import OK` - -- [ ] **Step 4: Commit** - -```bash -git add pyproject.toml -git commit -m "chore: add pyproject.toml for package metadata and editable install" -``` - ---- - -### Task 10: Reorganise `docs/` — move internal files - -**Files:** -- Move (via `git mv`): 7 files/dirs → `docs/internal/` - -> Note: `git mv` preserves history. `docs/superpowers/` is intentionally left in place (it is a design record, not an internal dev artifact). `CHANGELOG.md` is intentionally left in root (standard open-source convention). 
- -- [ ] **Step 1: Create `docs/internal/` and move files** - -```bash -mkdir -p docs/internal - -git mv docs/acquire_agent_fix.md docs/internal/acquire_agent_fix.md -git mv docs/mvp_implementation_complete.md docs/internal/mvp_implementation_complete.md -git mv docs/analysis docs/internal/analysis -git mv docs/plans docs/internal/plans -git mv COMPLETION_SUMMARY.md docs/internal/COMPLETION_SUMMARY.md -git mv IMPLEMENTATION_STATUS.md docs/internal/IMPLEMENTATION_STATUS.md -git mv description.md docs/internal/description.md -``` - -- [ ] **Step 2: Verify moves completed cleanly** - -```bash -git status --short | grep "^R" -``` -Expected: 7 rename lines, one per moved item. - -- [ ] **Step 3: Verify `docs/superpowers/` and `CHANGELOG.md` are untouched** - -```bash -ls docs/superpowers/specs/ && ls CHANGELOG.md -``` -Expected: both exist with no errors. - -- [ ] **Step 4: Commit** - -`git mv` already stages the renames; commit directly without re-running `git add`: - -```bash -git commit -m "refactor: move internal dev notes to docs/internal/" -``` - ---- - -### Task 11: Create `docs/architecture.md` - -**Files:** -- Create: `docs/architecture.md` - -- [ ] **Step 1: Create the stub** - -```markdown -# Architecture Overview - -EBM 5A is a multi-agent pipeline that operationalises the Evidence-Based Medicine **5A framework** -(Ask → Acquire → Appraise → Apply → Assess) using a **ReAct** control loop. 
- -For the full architecture description, see the README: - -- [How It Works](../README.md#how-it-works) — pipeline diagram and scheduling rules -- [Project Structure](../README.md#project-structure) — file-level breakdown -- [Key Engineering Decisions](../README.md#key-engineering-decisions) — design rationale - -For the detailed design spec, see: -- [`docs/superpowers/specs/2026-03-16-opensource-quality-design.md`](superpowers/specs/2026-03-16-opensource-quality-design.md) -``` - -- [ ] **Step 2: Commit** - -```bash -git add docs/architecture.md -git commit -m "docs: add architecture.md stub linking to README sections" -``` - ---- - -### Task 12: Update `README.md` Documentation table - -**Files:** -- Edit: `README.md` (two tables — English and Chinese) - -- [ ] **Step 1: Add `docs/internal/` row to the English Documentation table** - -Find the English Documentation table (search for `| Architecture design`). Add one row: - -```markdown -| Internal development notes | [`docs/internal/`](docs/internal/) | -``` - -- [ ] **Step 2: Add the same row to the Chinese Documentation table** - -Find the Chinese Documentation table (search for `| 架构设计文档`). Add one row: - -```markdown -| 内部开发记录 | [`docs/internal/`](docs/internal/) | -``` - -- [ ] **Step 3: Verify both tables render correctly** - -```bash -grep -n "docs/internal" README.md -``` -Expected: 2 lines (one per language section). - -- [ ] **Step 4: Commit** - -```bash -git add README.md -git commit -m "docs: add docs/internal/ entry to README documentation tables" -``` - ---- - -## Final Verification - -- [ ] Run `git log --oneline` — expect 12+ new commits on top of the previous baseline -- [ ] Run `pip install -e . 
--no-deps --quiet` — expect success -- [ ] Run `python -c "from src.main import main"` — expect no import error -- [ ] Confirm `LICENSE` exists: `head -1 LICENSE` — expect `MIT License` -- [ ] Confirm no real credentials in repo: `git grep -rE "(sk-P|@gmail|huatuo)" -- ':!*.log'` — expect no output -- [ ] Confirm `.env.example` is tracked: `git ls-files .env.example` — expect `.env.example` -- [ ] Confirm `QUICKSTART.md` is tracked: `git ls-files QUICKSTART.md` — expect `QUICKSTART.md` -- [ ] Confirm CI file valid: `python -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))" && echo OK` -- [ ] Confirm TOML valid: `python -c "import tomllib; tomllib.load(open('pyproject.toml','rb'))" && echo OK` -- [ ] Confirm `CHANGELOG.md` still in root: `ls CHANGELOG.md` — expect `CHANGELOG.md` -- [ ] Confirm `docs/superpowers/` untouched: `ls docs/superpowers/specs/` — expect spec file listed -- [ ] Confirm `docs/internal/` populated: `ls docs/internal/` — expect 7 items diff --git a/docs/superpowers/plans/2026-03-25-web-ui-improvements.md b/docs/superpowers/plans/2026-03-25-web-ui-improvements.md deleted file mode 100644 index f18cb5d..0000000 --- a/docs/superpowers/plans/2026-03-25-web-ui-improvements.md +++ /dev/null @@ -1,1192 +0,0 @@ -# Web UI Improvements Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Apply five improvements to the TrueTruth web UI: branding, display-confusion fixes, contextual tooltips, localStorage history panel, and span-level evidence retrieval. - -**Architecture:** Pure frontend tasks (Problems 5, 4, 2, 1) modify React components and the Zustand store; they are independent of the backend. 
Problem 3 (span retrieval) requires coordinated changes across the Python backend (schema → local_evidence_db → acquire_agent → judge_llm → serializers) before the frontend EvidenceTable change. All tasks follow the dependency order prescribed in the spec. - -**Tech Stack:** React 18 + Zustand 5 + Vite 4 (frontend); FastAPI + Python 3.10 (backend). No test framework is installed — build verification uses `npm run build` for frontend and inline `python3 -c` scripts for backend logic. - -**Spec:** `docs/superpowers/specs/2026-03-25-web-ui-improvements-design.md` - ---- - -## File Map - -| File | Action | Reason | -|------|--------|--------| -| `web/frontend/src/App.jsx` | Modify | Branding, layout (sidebar), read-only history mode | -| `web/frontend/index.html` | Modify | `` → TrueTruth | -| `web/frontend/src/index.css` | Modify | Header styles, tooltip styles, key_sentences highlight | -| `web/backend/app.py` | Modify | `FastAPI(title=...)` → TrueTruth | -| `web/frontend/src/components/StageCard.jsx` | Modify | Call-tab dual indicator, stage subtitles, AssessOutput label | -| `web/frontend/src/components/JudgeScorePanel.jsx` | Modify | Explanatory note, severity sort + count summary, header label | -| `web/frontend/src/components/RecommendationPanel.jsx` | Modify | Quality Assessment label | -| `web/frontend/src/components/InfoTooltip.jsx` | **Create** | Fixed-position tooltip popover | -| `web/frontend/src/components/EvidenceTable.jsx` | Modify | Render `key_sentences` as primary evidence | -| `web/frontend/src/store/workflowStore.js` | Modify | `saveToHistory`, `loadFromHistory`, `historyView` flag | -| `web/frontend/src/components/HistorySidebar.jsx` | **Create** | Collapsible history list | -| `src/state/schema.py` | Modify | Add `key_sentences` field to `Evidence` | -| `src/tools/local_evidence_db.py` | Modify | Add `_extract_spans`, call from `search_local` | -| `src/agents/acquire_agent.py` | Modify | `_listwise_rank` uses `key_sentences`; fix NameError | -| 
`src/judge/judge_llm.py` | Modify | Exclude `key_sentences` in Appraise branch | -| `web/backend/serializers.py` | Modify | Include `key_sentences` in `serialize_evidence_list` | - ---- - -## Task 1: Branding — TrueTruth - -**Files:** -- Modify: `web/frontend/src/App.jsx` -- Modify: `web/frontend/index.html` -- Modify: `web/frontend/src/index.css` -- Modify: `web/backend/app.py` - -- [ ] **Step 1: Update index.html title** - -Open `web/frontend/index.html`. Change: -```html -<title>Vite + React -``` -to: -```html -TrueTruth -``` - -- [ ] **Step 2: Update FastAPI app title** - -In `web/backend/app.py`, change: -```python -app = FastAPI(title="EBM 5A Clinical Decision Support", version="1.0.0") -``` -to: -```python -app = FastAPI(title="TrueTruth Clinical Decision Support", version="1.0.0") -``` - -- [ ] **Step 3: Update App.jsx header** - -In `web/frontend/src/App.jsx`, replace the header div: -```jsx - {/* Header */} -

-

EBM 5A

- Clinical Decision Support - {status === 'completed' && ✓ Complete} - {status === 'error' && ✗ Error} -
-``` -with: -```jsx - {/* Header */} -
-
-

TrueTruth

- AI-Powered Clinical Evidence Synthesis -
-
- {status === 'completed' && ✓ Complete} - {status === 'error' && ✗ Error} -
-
-``` - -- [ ] **Step 4: Update header CSS** - -In `web/frontend/src/index.css`, replace the existing `.header` rule (find it by searching for `.header {`) with: -```css -.header { - display: flex; - align-items: center; - justify-content: space-between; - padding: 0 24px; - height: 72px; - background: var(--bg2); - border-bottom: 1px solid var(--border); - flex-shrink: 0; -} -.header-brand { display: flex; flex-direction: column; gap: 2px; } -.header-brand h1 { font-size: 32px; font-weight: 700; color: var(--text); margin: 0; letter-spacing: -0.5px; } -.header-subtitle { font-size: 12px; color: var(--text3); font-style: italic; } -.header-badges { display: flex; gap: 8px; align-items: center; } -``` - -- [ ] **Step 5: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` -Expected: `✓ built in ...` - -- [ ] **Step 6: Commit** - -```bash -git add web/frontend/src/App.jsx web/frontend/index.html web/frontend/src/index.css web/backend/app.py -git commit -m "feat: rebrand to TrueTruth, enlarge header" -``` - ---- - -## Task 2: Display Fix — Call Tab Dual Indicator - -**Files:** -- Modify: `web/frontend/src/components/StageCard.jsx` - -- [ ] **Step 1: Update call tab render in StageCard.jsx** - -Find the call tabs section (search for `className={`call-tab`). 
Replace: -```jsx - -``` -with: -```jsx - -``` - -- [ ] **Step 2: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 3: Commit** - -```bash -git add web/frontend/src/components/StageCard.jsx -git commit -m "feat: show judge pass/fail and scheduling action on call tabs" -``` - ---- - -## Task 3: Display Fix — JudgeScorePanel Note and Severity Summary - -**Files:** -- Modify: `web/frontend/src/components/JudgeScorePanel.jsx` - -- [ ] **Step 1: Read the current file** - -```bash -cat /data/wuyuang/ebm5a/web/frontend/src/components/JudgeScorePanel.jsx -``` - -- [ ] **Step 2: Add explanatory note below score circle, severity summary before issue list** - -The file renders: score circle → dimension bars → issues list. Make these changes: - -After the score circle div (the one with `className="judge-overall"`), add: -```jsx -

- 总分为各维度加权平均;Minor 问题不大幅影响分数,但仍列出供参考。 -

-``` - -Before the issues list, add a severity count summary. The backend emits severity values in **lowercase** (`"critical"`, `"major"`, `"minor"`). Replace the full issues rendering block with: -```jsx - {ev.issues?.length > 0 && (() => { - const sorted = [...ev.issues].sort((a,b)=>{ - const o={critical:0,major:1,minor:2}; return (o[a.severity]??3)-(o[b.severity]??3) - }) - const counts = sorted.reduce((acc,i)=>{acc[i.severity]=(acc[i.severity]||0)+1;return acc},{}) - const summary = Object.entries(counts) - .map(([s,n])=>`${n} ${s.charAt(0).toUpperCase()+s.slice(1)}`).join(' · ') - return ( -
-

- Issues ({summary}) -

- {sorted.map((issue, i) => ( -
- {issue.severity} - {issue.dimension} - {issue.description?.slice(0,200)} -
- ))} -
- ) - })()} -``` - -Also update the "Judge Evaluation" section title (find `section-title` used for "Judge Evaluation" text) to read "Judge Score (第三方评分)": -```jsx -

Judge Score (第三方评分)

-``` - -- [ ] **Step 3: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 4: Commit** - -```bash -git add web/frontend/src/components/JudgeScorePanel.jsx -git commit -m "feat: add scoring explanation note and severity summary to JudgeScorePanel" -``` - ---- - -## Task 4: Display Fix — Assess Score Labels (Required Before Task 5) - -**Files:** -- Modify: `web/frontend/src/components/StageCard.jsx` -- Modify: `web/frontend/src/components/RecommendationPanel.jsx` - -- [ ] **Step 1: Label quality ring in AssessOutput** - -In `StageCard.jsx`, find `AssessOutput` (search for `function AssessOutput`). Find the quality ring div: -```jsx -
{Math.round(score * 100)}%
-

Identified Gaps

-``` -Replace with: -```jsx -

Workflow Quality (自评)

-
{Math.round(score * 100)}%
-

Identified Gaps

-``` - -- [ ] **Step 2: Label quality ring in RecommendationPanel** - -In `RecommendationPanel.jsx`, find the "Quality Assessment" section: -```jsx - {assess && ( -
-
Quality Assessment
-``` -Replace with: -```jsx - {assess && ( -
-
Workflow Quality (自评)
-``` - -- [ ] **Step 3: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 4: Commit** - -```bash -git add web/frontend/src/components/StageCard.jsx web/frontend/src/components/RecommendationPanel.jsx -git commit -m "feat: label Workflow Quality and Judge Score distinctly in Assess stage" -``` - ---- - -## Task 5: Tooltips — InfoTooltip Component - -**Files:** -- Create: `web/frontend/src/components/InfoTooltip.jsx` -- Modify: `web/frontend/src/index.css` - -- [ ] **Step 1: Create InfoTooltip.jsx** - -```jsx -// web/frontend/src/components/InfoTooltip.jsx -import { useState, useCallback } from 'react' - -export default function InfoTooltip({ text }) { - const [pos, setPos] = useState(null) - - const show = useCallback((e) => { - const r = e.currentTarget.getBoundingClientRect() - setPos({ top: r.bottom + 6, left: r.left }) - }, []) - - const hide = useCallback(() => setPos(null), []) - - return ( - - - {pos && ( -
- {text} -
- )} -
- ) -} -``` - -- [ ] **Step 2: Add CSS for tooltip** - -Append to `web/frontend/src/index.css`: -```css -/* InfoTooltip */ -.info-tooltip-wrap { position: relative; display: inline-flex; align-items: center; margin-left: 4px; } -.info-tooltip-icon { font-size: 11px; color: var(--text3); cursor: default; user-select: none; } -.info-tooltip-icon:hover { color: var(--text2); } -.info-tooltip-popover { - position: fixed; - z-index: 9999; - background: var(--bg3); - border: 1px solid var(--border); - border-radius: var(--radius); - padding: 8px 12px; - font-size: 12px; - color: var(--text2); - line-height: 1.5; - max-width: 280px; - box-shadow: 0 4px 16px rgba(0,0,0,0.4); - pointer-events: none; -} -``` - -- [ ] **Step 3: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 4: Commit** - -```bash -git add web/frontend/src/components/InfoTooltip.jsx web/frontend/src/index.css -git commit -m "feat: add InfoTooltip component with fixed-position popover" -``` - ---- - -## Task 6: Tooltips — Add to Components and Stage Descriptions - -**Files:** -- Modify: `web/frontend/src/components/StageCard.jsx` -- Modify: `web/frontend/src/components/JudgeScorePanel.jsx` -- Modify: `web/frontend/src/components/DecisionBadge.jsx` -- Modify: `web/frontend/src/components/RecommendationPanel.jsx` - -- [ ] **Step 1: Add stage subtitles and tooltips in StageCard.jsx** - -Add `import InfoTooltip from './InfoTooltip'` at the top of StageCard.jsx. 
- -Define a constant near the top of the file (before the component functions): -```jsx -const STAGE_DESCRIPTIONS = { - Ask: '将临床问题结构化为 PICO 格式,提取检索关键词', - Acquire: '从证据库中检索相关文献段落,筛选最相关条目', - Appraise: '评估证据质量和等级(GRADE),识别研究间冲突', - Apply: '基于证据生成临床推荐意见及推荐强度', - Assess: '自评推荐质量,识别证据缺口,决定是否需要回溯', -} - -const TOOLTIPS = { - PICO: '临床问题四要素:P=患者/病症, I=干预措施, C=对照, O=结局指标', - GRADE: '证据质量分级体系:High→Moderate→Low→Very Low', - CAVEATS: '使用本推荐意见时需注意的例外、限制或特殊情况', - BACKTRACK: '当前阶段质量不足时,系统回到更早阶段重新执行', - WORKFLOW_QUALITY: 'Assess agent 对整个 workflow 输出质量的自评分,独立于 Judge Score', -} -``` - -In the `StageCard` component's panel-header, add the stage description subtitle: -```jsx -
-
- {stageName} - {STAGE_DESCRIPTIONS[stageName] && ( -
{STAGE_DESCRIPTIONS[stageName]}
- )} -
- {stage.status === 'running' && ⏳ Running…} - {call?.elapsed_s && {call.elapsed_s}s} -
-``` - -In `AskOutput`, add InfoTooltip next to the "PICO" table header. Find the table and add: -```jsx - return ( -
-
- PICO Query -
- -``` - -In `AssessOutput`, add InfoTooltip next to "Workflow Quality": -```jsx -

Workflow Quality (自评)

-``` - -- [ ] **Step 2: Add tooltips in JudgeScorePanel.jsx** - -Add `import InfoTooltip from './InfoTooltip'` at top. - -Find the "Judge Score" section title and add tooltip: -```jsx -

Judge Score (第三方评分)

-``` - -- [ ] **Step 3: Add tooltip in DecisionBadge.jsx** - -Add `import InfoTooltip from './InfoTooltip'` at top. - -**First, read `index.css` and confirm the current `.decision-row` CSS rule before editing** — the plan replaces the component root div class from `decision-row` to a plain `div`, which removes that class from the wrapper. If `.decision-row` has styles that should be preserved (border, padding, etc.), move them to a CSS rule on the inner badge span or inline style instead. - -Then replace the component's return: -```jsx - return ( -
-

- Scheduling Decision - -

-``` - -- [ ] **Step 4: Add tooltip in RecommendationPanel.jsx** - -Add `import InfoTooltip from './InfoTooltip'` at top. - -Find the "Caveats" section title and add tooltip: -```jsx -
Caveats
-``` - -Also find "Workflow Quality" section title: -```jsx -
Workflow Quality (自评)
-``` - -- [ ] **Step 5: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 6: Commit** - -```bash -git add web/frontend/src/components/StageCard.jsx web/frontend/src/components/JudgeScorePanel.jsx web/frontend/src/components/DecisionBadge.jsx web/frontend/src/components/RecommendationPanel.jsx -git commit -m "feat: add stage descriptions and term tooltips across components" -``` - ---- - -## Task 7: History Panel — Store Actions - -**Files:** -- Modify: `web/frontend/src/store/workflowStore.js` - -- [ ] **Step 1: Change `create` signature and add history helpers to workflowStore.js** - -**First**, change the `create` call signature at line 26 from: -```js -export const useWorkflowStore = create((set) => ({ -``` -to: -```js -export const useWorkflowStore = create((set, get) => ({ -``` -This makes `get()` available in the factory scope (needed for `saveToHistory` and `loadFromHistory`). - -**Then** add the following helper functions at the top of the file (before `create`): - -```js -const HISTORY_KEY = 'truetruth_history' -const MAX_HISTORY = 20 -const MAX_ENTRY_BYTES = 200_000 - -function pruneStagesForStorage(stages) { - // Strip per-call logs to reduce size - const pruned = {} - for (const [name, stage] of Object.entries(stages)) { - pruned[name] = { - ...stage, - calls: stage.calls.map(c => ({ ...c, logs: [] })) - } - } - return pruned -} - -function loadHistory() { - try { return JSON.parse(localStorage.getItem(HISTORY_KEY) || '[]') } catch { return [] } -} - -function persistHistory(entries) { - try { localStorage.setItem(HISTORY_KEY, JSON.stringify(entries)) } catch {} -} -``` - -Then inside the `create((set, get) => ({` callback, add after the existing dispatch function: - -```js - history: loadHistory(), - historyView: false, - - saveToHistory(status) { - const state = get() - const entry = { - id: crypto.randomUUID(), - question: state.question, - timestamp: new Date().toISOString(), - 
status, - backtracks: state.backtracks, - finalResult: state.finalResult, - stages: null, - } - const withStages = { ...entry, stages: pruneStagesForStorage(state.stages) } - const serialized = JSON.stringify(withStages) - const chosen = serialized.length <= MAX_ENTRY_BYTES ? withStages : entry - const history = [chosen, ...get().history].slice(0, MAX_HISTORY) - persistHistory(history) - set({ history }) - }, - - loadFromHistory(entry) { - set({ - historyView: true, - question: entry.question, - status: entry.status === 'error' ? 'error' : 'completed', - stages: entry.stages || Object.fromEntries( - ['Ask','Acquire','Appraise','Apply','Assess'].map(n => [n, { status: 'pending', calls: [] }]) - ), - backtracks: entry.backtracks || [], - finalResult: entry.finalResult, - logs: [], - error: null, - currentAgent: null, - }) - }, - - exitHistoryView() { - set({ - historyView: false, - ...INITIAL, - stages: Object.fromEntries(STAGE_NAMES.map(n => [n, makeStage()])), - }) - }, -``` - -Note: change the `create` call signature from `create((set) => ({` to `create((set, get) => ({` to allow `get()` access. - -- [ ] **Step 2: Wire saveToHistory into dispatch** - -Inside the `dispatch` function's switch statement, at the end of `WORKFLOW_COMPLETED` case: -```js - case 'WORKFLOW_COMPLETED': { - const newState = { status: 'completed', finalResult: payload, currentAgent: null } - // Save to history (terminated if no recommendation) - setTimeout(() => { - get().saveToHistory(payload.recommendation ? 
'completed' : 'terminated') - }, 0) - return newState - } -``` - -And add `WORKFLOW_ERROR` history save: -```js - case 'WORKFLOW_ERROR': { - setTimeout(() => { get().saveToHistory('error') }, 0) - return { status: 'error', error: payload.error, currentAgent: null } - } -``` - -- [ ] **Step 3: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 4: Commit** - -```bash -git add web/frontend/src/store/workflowStore.js -git commit -m "feat: add history persistence to workflowStore (localStorage)" -``` - ---- - -## Task 8: History Panel — Sidebar Component and Layout - -**Files:** -- Create: `web/frontend/src/components/HistorySidebar.jsx` -- Modify: `web/frontend/src/App.jsx` -- Modify: `web/frontend/src/index.css` - -- [ ] **Step 1: Create HistorySidebar.jsx** - -```jsx -// web/frontend/src/components/HistorySidebar.jsx -import { useWorkflowStore } from '../store/workflowStore' - -const SIDEBAR_KEY = 'truetruth_sidebar_open' - -function relativeTime(iso) { - const diff = Date.now() - new Date(iso).getTime() - const m = Math.floor(diff / 60000) - if (m < 1) return '刚刚' - if (m < 60) return `${m} 分钟前` - const h = Math.floor(m / 60) - if (h < 24) return `${h} 小时前` - return `${Math.floor(h / 24)} 天前` -} - -const STATUS_ICON = { completed: '✓', terminated: '⚠', error: '✗' } -const STATUS_COLOR = { completed: 'var(--green)', terminated: 'var(--orange)', error: 'var(--red)' } - -export default function HistorySidebar({ open, onToggle }) { - const { history, loadFromHistory, exitHistoryView, historyView } = useWorkflowStore() - - return ( -
-
- {open ? '历史记录' : ''} - -
- {open && ( - <> - {historyView && ( - - )} - {history.length === 0 && ( -

暂无历史记录

- )} - {history.map(entry => ( -
loadFromHistory(entry)} - > -
{entry.question.slice(0, 60)}{entry.question.length > 60 ? '…' : ''}
-
- - {STATUS_ICON[entry.status] || '·'} - - {relativeTime(entry.timestamp)} -
-
- ))} - - )} -
- ) -} -``` - -- [ ] **Step 2: Add sidebar CSS** - -Append to `web/frontend/src/index.css`: -```css -/* History Sidebar */ -.history-sidebar { - background: var(--bg2); - border-right: 1px solid var(--border); - display: flex; - flex-direction: column; - transition: width 0.2s ease; - flex-shrink: 0; - overflow: hidden; -} -.history-sidebar.open { width: 220px; } -.history-sidebar.closed { width: 36px; } -.history-header { - display: flex; - align-items: center; - justify-content: space-between; - padding: 10px 8px; - border-bottom: 1px solid var(--border); - min-height: 40px; -} -.history-title { font-size: 12px; font-weight: 600; color: var(--text2); white-space: nowrap; } -.history-toggle { background: none; border: none; color: var(--text3); cursor: pointer; font-size: 14px; padding: 2px 4px; } -.history-toggle:hover { color: var(--text); } -.history-back-btn { - display: block; width: 100%; padding: 8px 10px; background: var(--bg3); - border: none; border-bottom: 1px solid var(--border); color: var(--blue); - font-size: 12px; cursor: pointer; text-align: left; -} -.history-back-btn:hover { background: var(--bg2); } -.history-item { - padding: 10px; border-bottom: 1px solid var(--border); - cursor: pointer; transition: background 0.1s; -} -.history-item:hover { background: var(--bg3); } -.history-item-q { font-size: 12px; color: var(--text); line-height: 1.4; margin-bottom: 4px; } -.history-item-meta { display: flex; gap: 8px; align-items: center; font-size: 11px; } -``` - -- [ ] **Step 3: Integrate sidebar into App.jsx** - -Add imports at top of App.jsx: -```jsx -import { useState } from 'react' // already present -import HistorySidebar from './components/HistorySidebar' -``` - -Add sidebar open state (after the existing `useState` calls): -```jsx - const [sidebarOpen, setSidebarOpen] = useState( - () => localStorage.getItem('truetruth_sidebar_open') !== 'false' - ) - - function toggleSidebar() { - const next = !sidebarOpen - setSidebarOpen(next) - 
localStorage.setItem('truetruth_sidebar_open', String(next)) - } -``` - -Also destructure `historyView` from the store: -```jsx - const { status, stages, logs, backtracks, finalResult, error, historyView } = useWorkflowStore() -``` - -Wrap the existing `
` contents in a flex layout: -```jsx - return ( -
- -
- {/* ... all existing content unchanged ... */} -
-
- ) -``` - -In history view mode, disable the question form. Find the `
` element and add: -```jsx - -``` - -- [ ] **Step 4: Add app-shell CSS** - -In `index.css`, add before the `.app` rule: -```css -.app-shell { display: flex; height: 100vh; overflow: hidden; } -.app { flex: 1; overflow-y: auto; min-width: 0; } -``` - -- [ ] **Step 5: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 6: Commit** - -```bash -git add web/frontend/src/components/HistorySidebar.jsx web/frontend/src/App.jsx web/frontend/src/index.css -git commit -m "feat: add collapsible history sidebar with localStorage persistence" -``` - ---- - -## Task 9: Span Retrieval — Schema Change - -**Files:** -- Modify: `src/state/schema.py` - -- [ ] **Step 1: Add key_sentences field to Evidence** - -Read `src/state/schema.py` and find the `Evidence` dataclass. It currently ends with `full_text: Optional[str] = None`. Add after it: -```python - key_sentences: Optional[str] = None # extracted evidence span (local DB only) -``` - -- [ ] **Step 2: Verify import works** - -```bash -cd /data/wuyuang/ebm5a && python3 -c " -from src.state.schema import Evidence -e = Evidence(title='test', source='test') -print('key_sentences default:', repr(e.key_sentences)) -assert e.key_sentences is None -print('OK') -" -``` -Expected: `key_sentences default: None` and `OK` - -- [ ] **Step 3: Commit** - -```bash -git add src/state/schema.py -git commit -m "feat: add key_sentences field to Evidence dataclass" -``` - ---- - -## Task 10: Span Retrieval — Extract Spans in local_evidence_db.py - -**Files:** -- Modify: `src/tools/local_evidence_db.py` - -- [ ] **Step 1: Add _extract_spans function** - -After the `_rrf_fuse` function (before `search_local`), add: - -```python -import re as _re - -def _extract_spans(abstract_text: str, query_keywords: list, max_spans: int = 3, max_chars: int = 200) -> str | None: - """Extract the most relevant sentence spans from an abstract. 
- - Adjacent sentences that both contain query keywords are merged into a single span. - If >=60% of sentences are relevant, return the full abstract as one span. - - Args: - abstract_text: The article abstract. - query_keywords: Lowercase keyword tokens from the search query. - max_spans: Maximum number of spans to return. - max_chars: Maximum characters per span. - - Returns: - Concatenated spans separated by ' … ', or None if no relevant sentences found. - """ - if not abstract_text or not query_keywords: - return None - - # Split into sentences on common delimiters - sentences = [s.strip() for s in _re.split(r'(?<=[.!?。!?])\s+', abstract_text) if s.strip()] - if not sentences: - return None - - # Score each sentence by keyword overlap (case-insensitive) - kw_set = {kw.lower() for kw in query_keywords if len(kw) > 2} - scores = [] - for sent in sentences: - sent_lower = sent.lower() - score = sum(1 for kw in kw_set if kw in sent_lower) - scores.append(score) - - threshold = 1 # at least one keyword match - - # If >=60% of sentences match, return full abstract - matching = sum(1 for s in scores if s >= threshold) - if len(sentences) > 0 and matching / len(sentences) >= 0.6: - return abstract_text[:max_chars * max_spans] - - # Merge adjacent high-scoring sentences into spans - spans = [] - i = 0 - while i < len(sentences): - if scores[i] >= threshold: - # Start a new span; merge adjacent matching sentences - span_sents = [sentences[i]] - j = i + 1 - while j < len(sentences) and scores[j] >= threshold: - span_sents.append(sentences[j]) - j += 1 - span_text = ' '.join(span_sents)[:max_chars] - span_score = max(scores[i:j]) - spans.append((span_score, span_text)) - i = j - else: - i += 1 - - if not spans: - return None - - # Return top-N spans by score - spans.sort(key=lambda x: x[0], reverse=True) - return ' … '.join(text for _, text in spans[:max_spans]) -``` - -- [ ] **Step 2: Call _extract_spans in search_local** - -In `search_local`, after building each 
`Evidence` object (inside the loop), add span extraction. Find the `results.append(Evidence(...))` call and modify the surrounding code: - -```python - # Extract query keywords for span matching - query_keywords = [t for t in tokens if len(t) > 2] # reuse tokens from BM25 - - results: List[Evidence] = [] - n = min(top_k, len(fused)) - for rank, (pmcid, _score) in enumerate(fused[:top_k]): - a = articles.get(pmcid) - if a is None: - continue - relevance = round(1.0 - (rank / max(n, 1)) * 0.9, 3) if n > 1 else 1.0 - abstract = a.get("abstract", "") - key_sentences = _extract_spans(abstract, query_keywords) - results.append(Evidence( - title=a.get("title", ""), - source=a.get("journal", "PMC"), - pmid=a.get("pmid"), - abstract=abstract, - relevance_score=relevance, - study_type=None, - publication_date=a.get("publication_date"), - grade_level=None, - pmcid=pmcid, - full_text=a.get("full_text"), - key_sentences=key_sentences, - )) - - return results -``` - -- [ ] **Step 3: Verify span extraction works** - -```bash -cd /data/wuyuang/ebm5a && python3 -c " -from src.tools.local_evidence_db import _extract_spans -abstract = 'Preeclampsia is a serious condition. Magnesium sulfate is the drug of choice for seizure prevention. Other drugs may be used in mild cases. Blood pressure monitoring is essential. Regular urine protein tests should be performed.' -result = _extract_spans(abstract, ['magnesium', 'seizure', 'preeclampsia']) -print('span:', result) -assert result is not None -assert 'magnesium' in result.lower() or 'preeclampsia' in result.lower() -print('OK') -" -``` -Expected: prints a span containing relevant sentences and `OK`. 
- -- [ ] **Step 4: Commit** - -```bash -git add src/tools/local_evidence_db.py -git commit -m "feat: add span-level evidence extraction to local_evidence_db" -``` - ---- - -## Task 11: Span Retrieval — acquire_agent.py Changes - -**Files:** -- Modify: `src/agents/acquire_agent.py` - -- [ ] **Step 1: Fix NameError — initialize search_query_used before try block** - -At the start of the `execute` method in `AcquireAgent`, find the point just before the `try:` block that contains the search step. Add: -```python - search_query_used = "" -``` -This ensures the variable is always defined even if the try block raises before assignment, so the `except` handler can safely return `"search_query": search_query_used`. - -Remove the old `"search_query": filtered_query` in the except block (line ~262) and replace with `"search_query": search_query_used`. - -- [ ] **Step 2: Update _listwise_rank to prefer key_sentences** - -Find the `_listwise_rank` method. Inside the candidate block construction, find `e.abstract[:150]` and replace with: -```python -e.key_sentences if e.key_sentences else e.abstract[:150] -``` - -- [ ] **Step 3: Verify import still works** - -```bash -cd /data/wuyuang/ebm5a && python3 -c " -from src.agents.acquire_agent import AcquireAgent -print('import OK') -" -``` -Expected: `import OK` - -- [ ] **Step 4: Commit** - -```bash -git add src/agents/acquire_agent.py -git commit -m "feat: use key_sentences in listwise ranking; fix latent NameError in exception handler" -``` - ---- - -## Task 12: Span Retrieval — judge_llm.py Exclusion - -**Files:** -- Modify: `src/judge/judge_llm.py` - -- [ ] **Step 1: Exclude key_sentences in Appraise branch** - -Find the loop in the Appraise branch that does: -```python - for ev in appraisal_d.get("evidence", []): - ev.pop("abstract", None) - ev.pop("full_text", None) -``` -Add one line: -```python - ev.pop("key_sentences", None) -``` - -- [ ] **Step 2: Verify import** - -```bash -cd /data/wuyuang/ebm5a && python3 -c "from 
src.judge.judge_llm import JudgeLLM; print('OK')" -``` - -- [ ] **Step 3: Commit** - -```bash -git add src/judge/judge_llm.py -git commit -m "feat: exclude key_sentences from Appraise judge prompt" -``` - ---- - -## Task 13: Span Retrieval — serializers.py - -**Files:** -- Modify: `web/backend/serializers.py` - -- [ ] **Step 1: Include key_sentences in serialize_evidence_list** - -In `serialize_evidence_list`, add `key_sentences` to the dict: -```python - result.append({ - "title": e.title, - "pmid": getattr(e, "pmid", None), - "pmcid": getattr(e, "pmcid", None), - "source": getattr(e, "source", ""), - "study_type": getattr(e, "study_type", None), - "relevance_score": getattr(e, "relevance_score", 0.0), - "grade_level": getattr(e, "grade_level", None), - "abstract_preview": (getattr(e, "abstract", "") or "")[:200], - "key_sentences": getattr(e, "key_sentences", None), - }) -``` - -- [ ] **Step 2: Verify** - -```bash -cd /data/wuyuang/ebm5a && python3 -c " -from src.state.schema import Evidence -from web.backend.serializers import serialize_evidence_list -e = Evidence(title='Test', source='PMC', abstract='Hello world. This is a test.', key_sentences='Hello world.') -result = serialize_evidence_list([e]) -print('key_sentences in output:', 'key_sentences' in result[0]) -assert result[0]['key_sentences'] == 'Hello world.' -print('OK') -" -``` - -- [ ] **Step 3: Commit** - -```bash -git add web/backend/serializers.py -git commit -m "feat: include key_sentences in evidence serializer output" -``` - ---- - -## Task 14: Span Retrieval — EvidenceTable.jsx - -**Files:** -- Modify: `web/frontend/src/components/EvidenceTable.jsx` -- Modify: `web/frontend/src/index.css` - -- [ ] **Step 1: Read current EvidenceTable.jsx** - -```bash -cat /data/wuyuang/ebm5a/web/frontend/src/components/EvidenceTable.jsx -``` - -- [ ] **Step 2: Add key_sentences display** - -In the evidence row, find the `
<td>`, replace the existing `<div>` abstract block with: -```jsx - {/* key_sentences highlight — shown when span extraction found relevant sentences */} - {e.key_sentences && ( - <div className="evidence-key-sentences">{e.key_sentences}</div> - )} - {e.abstract_preview && ( - <details> - <summary> - {e.key_sentences ? 'Context Preview' : 'Abstract Preview'} - </summary> - <div> - {e.abstract_preview} - </div> - </details> - )} -``` -This entire replacement stays inside the existing `<td>
` cell. Inside that cell, after the title/link/badge line and before the closing `` — do not remove or restructure the `` wrapper. - -- [ ] **Step 3: Add CSS for key_sentences highlight** - -Append to `web/frontend/src/index.css`: -```css -/* Key sentences evidence highlight */ -.evidence-key-sentences { - font-size: 12px; - color: var(--text); - line-height: 1.6; - margin-top: 6px; - padding: 6px 10px; - background: rgba(59,130,246,0.08); - border-left: 3px solid var(--blue); - border-radius: 0 4px 4px 0; -} -``` - -- [ ] **Step 4: Build and verify** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 | tail -5 -``` - -- [ ] **Step 5: Commit** - -```bash -git add web/frontend/src/components/EvidenceTable.jsx web/frontend/src/index.css -git commit -m "feat: display key_sentences as highlighted evidence spans in EvidenceTable" -``` - ---- - -## Task 15: Rebuild Frontend and Restart Backend - -- [ ] **Step 1: Final frontend build** - -```bash -cd /data/wuyuang/ebm5a/web/frontend && npm run build 2>&1 -``` -Expected: `✓ built in ...` with no errors or warnings. - -- [ ] **Step 2: Kill and restart backend** - -```bash -pkill -f "uvicorn web.backend.app" 2>/dev/null || true -sleep 1 -cd /data/wuyuang/ebm5a -uvicorn web.backend.app:app --port 8888 > /tmp/ebm_backend.log 2>&1 & -sleep 2 -curl -s http://localhost:8888/api/health -``` -Expected: `{"status":"ok",...}` - -- [ ] **Step 3: Smoke test SSE** - -```bash -cd /data/wuyuang/ebm5a -SESSION=$(curl -s -X POST http://localhost:8888/api/sessions \ - -H "Content-Type: application/json" \ - -d '{"question":"妊娠期高血压需要做哪些实验室检查?"}' \ - | python3 -c "import sys,json; print(json.load(sys.stdin)['session_id'])") -echo "session: $SESSION" -curl -s -N --max-time 20 "http://localhost:8888/api/run?session_id=$SESSION" | head -30 -``` -Expected: `workflow_started` event followed by `agent_started` for Ask. 
- -- [ ] **Step 4: Final commit** - -```bash -git add -A -git status # verify no untracked important files -git commit -m "chore: final build artifacts after web UI improvements" -``` diff --git a/docs/superpowers/plans/2026-04-03-repo-usability-improvements.md b/docs/superpowers/plans/2026-04-03-repo-usability-improvements.md deleted file mode 100644 index f0652a1..0000000 --- a/docs/superpowers/plans/2026-04-03-repo-usability-improvements.md +++ /dev/null @@ -1,1405 +0,0 @@ -# Repo Usability Improvements Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add Docker deployment, Makefile, GitHub Actions CI, setup validation, troubleshooting docs, glossary, issue/PR templates, and README Web UI docs — with no changes to agent or frontend feature code. - -**Architecture:** Each deliverable is a self-contained file. Tasks 1–5 build the Docker/Makefile foundation; Tasks 6–8 add automation and validation; Tasks 9–12 add documentation. All tasks are independent except Task 12 (README) which references commands finalised in Tasks 2–5. 
- -**Tech Stack:** Docker + Docker Compose, Nginx (alpine), Python 3.11-slim, Node 20-alpine, GitHub Actions, ruff (linting), pytest (test runner already in requirements.txt) - ---- - -## File Map - -| File | Action | Task | -|------|--------|------| -| `.gitignore` | Modify — add `node_modules/` | 1 | -| `.env.example` | Modify — add comments for all vars | 1 | -| `Dockerfile.backend` | Create | 2 | -| `nginx.conf` | Create | 3 | -| `Dockerfile.frontend` | Create | 3 | -| `docker-compose.yml` | Create | 4 | -| `docker-compose.dev.yml` | Create | 4 | -| `.dockerignore` | Create | 4 | -| `Makefile` | Create | 5 | -| `scripts/check_env.py` | Create | 6 | -| `.github/workflows/ci.yml` | Create | 7 | -| `docs/troubleshooting.md` | Create | 8 | -| `docs/glossary.md` | Create | 9 | -| `.github/ISSUE_TEMPLATE/bug_report.md` | Create | 10 | -| `.github/ISSUE_TEMPLATE/feature_request.md` | Create | 10 | -| `.github/pull_request_template.md` | Create | 10 | -| `README.md` | Modify — add CI badge, Docker section, Web UI section | 11 | -| `QUICKSTART.md` | Modify — add Docker as first option | 11 | - ---- - -## Task 1: Audit `.gitignore` and `.env.example` - -**Files:** -- Modify: `.gitignore` -- Modify: `.env.example` - -- [ ] **Step 1: Add `node_modules/` to `.gitignore`** - -The current `.gitignore` is missing `node_modules/`. Open `.gitignore` and add after the `venv/` line: - -``` -node_modules/ -web/frontend/dist/ -``` - -- [ ] **Step 2: Update `.env.example` with full comments** - -Replace the entire `.env.example` with: - -```bash -# Copy this file to .env and fill in your values. -# Run 'make check-env' to validate your configuration before first run. -# NEVER commit .env — it is gitignored. 
- -# ── Required ──────────────────────────────────────────────────────────────── - -# Base URL of your OpenAI-compatible LLM API -# OpenAI: https://api.openai.com/v1 -# Azure: https://YOUR-RESOURCE.openai.azure.com/openai/deployments/YOUR-DEPLOYMENT -# HuaTuo: https://api.huatuogpt.cn/v1 -LLM_BASE_URL=https://api.openai.com/v1 - -# Your API key for the LLM provider above -LLM_API_KEY=your_api_key_here - -# Model name — must match your provider's model identifier -# OpenAI examples: gpt-4 gpt-4o gpt-3.5-turbo -# Claude examples: claude-opus-4-6 claude-sonnet-4-6 -LLM_MODEL=gpt-4 - -# Your email address — required by NCBI/PubMed API (https://www.ncbi.nlm.nih.gov/home/develop/api/) -# NCBI will use this to contact you if your scripts cause problems. -PUBMED_EMAIL=your_email@example.com - -# ── Optional ───────────────────────────────────────────────────────────────── - -# Use a faster/cheaper model for Judge and Scheduling agents (~30–40% faster overall) -# If unset, LLM_MODEL is used for all agents. -# FAST_LLM_MODEL=gpt-3.5-turbo -``` - -- [ ] **Step 3: Commit** - -```bash -git add .gitignore .env.example -git commit -m "chore: update .gitignore (node_modules) and improve .env.example comments" -``` - ---- - -## Task 2: Create `Dockerfile.backend` - -**Files:** -- Create: `Dockerfile.backend` - -- [ ] **Step 1: Create `Dockerfile.backend`** - -```dockerfile -FROM python:3.11-slim - -WORKDIR /app - -# Install curl for Docker health check -RUN apt-get update \ - && apt-get install -y --no-install-recommends curl \ - && rm -rf /var/lib/apt/lists/* - -# Install Python deps in a separate layer for Docker cache efficiency. -# PyTorch (~2 GB) is in requirements.txt — this layer only rebuilds when deps change. 
-COPY requirements.txt requirements-web.txt ./ -RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \ - && pip install --no-cache-dir -r requirements.txt -r requirements-web.txt - -# Copy application source -COPY src/ ./src/ -COPY web/ ./web/ - -# Create non-root user, app directories, fix ownership. -# mkdir here so Docker named volumes inherit appuser ownership at runtime. -RUN useradd -m -u 1000 appuser \ - && mkdir -p /app/data/cache /app/logs \ - && chown -R appuser:appuser /app - -USER appuser - -EXPOSE 8000 - -CMD ["uvicorn", "web.backend.app:app", "--host", "0.0.0.0", "--port", "8000"] -``` - -Note on PyTorch: `--index-url https://download.pytorch.org/whl/cpu` installs the CPU-only wheel (~800 MB vs ~2.5 GB for CUDA). The system uses PyTorch for MedCPT re-ranking (CPU inference only). If GPU support is needed in future, change the index URL. - -- [ ] **Step 2: Verify the image builds** - -Run from the project root (takes several minutes first time due to PyTorch): - -```bash -docker build -f Dockerfile.backend -t ebm5a-backend:test . -``` - -Expected: `Successfully built ` with no errors. The final image should be ~3–4 GB. - -- [ ] **Step 3: Verify non-root user** - -```bash -docker run --rm ebm5a-backend:test whoami -``` - -Expected output: `appuser` - -- [ ] **Step 4: Clean up test image** - -```bash -docker rmi ebm5a-backend:test -``` - -- [ ] **Step 5: Commit** - -```bash -git add Dockerfile.backend -git commit -m "feat: add Dockerfile.backend (non-root, CPU-only torch)" -``` - ---- - -## Task 3: Create `nginx.conf` and `Dockerfile.frontend` - -**Files:** -- Create: `nginx.conf` -- Create: `Dockerfile.frontend` - -- [ ] **Step 1: Create `nginx.conf`** - -```nginx -server { - listen 80; - server_name _; - - root /usr/share/nginx/html; - index index.html; - - # SPA routing: React state-based navigation still benefits from this — - # prevents 404 if a user directly navigates to any path on refresh. 
- location / { - try_files $uri $uri/ /index.html; - } - - # Reverse proxy to FastAPI backend. - # Preserves the /api prefix — backend routes are registered as /api/sessions etc. - location /api { - proxy_pass http://backend:8000; - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header Connection ""; - - # Critical for Server-Sent Events (SSE): disable nginx buffering - # so workflow progress events are forwarded to the browser immediately. - proxy_buffering off; - proxy_cache off; - proxy_read_timeout 600s; - } -} -``` - -- [ ] **Step 2: Create `Dockerfile.frontend`** - -```dockerfile -# Stage 1: Build the React/Vite app -FROM node:20-alpine AS builder - -WORKDIR /app - -# Install deps first (separate layer — only rebuilds when package.json changes) -COPY web/frontend/package*.json ./ -RUN npm ci - -# Copy source and build -COPY web/frontend/ ./ -RUN npm run build -# Output is in /app/dist - -# Stage 2: Serve with Nginx -FROM nginx:alpine - -# Copy built static files -COPY --from=builder /app/dist /usr/share/nginx/html - -# Copy our custom Nginx config (replaces the default) -COPY nginx.conf /etc/nginx/conf.d/default.conf - -EXPOSE 80 - -CMD ["nginx", "-g", "daemon off;"] -``` - -- [ ] **Step 3: Verify frontend image builds** - -```bash -docker build -f Dockerfile.frontend -t ebm5a-frontend:test . -``` - -Expected: `Successfully built `. Final image is ~30–50 MB (nginx:alpine is tiny). 
- -- [ ] **Step 4: Verify nginx config is valid inside the image** - -```bash -docker run --rm ebm5a-frontend:test nginx -t -``` - -Expected: `nginx: configuration file /etc/nginx/nginx.conf test is successful` - -- [ ] **Step 5: Clean up test image** - -```bash -docker rmi ebm5a-frontend:test -``` - -- [ ] **Step 6: Commit** - -```bash -git add nginx.conf Dockerfile.frontend -git commit -m "feat: add nginx.conf (SPA routing + SSE proxy) and Dockerfile.frontend" -``` - ---- - -## Task 4: Create `docker-compose.yml`, `docker-compose.dev.yml`, `.dockerignore` - -**Files:** -- Create: `docker-compose.yml` -- Create: `docker-compose.dev.yml` -- Create: `.dockerignore` - -- [ ] **Step 1: Create `docker-compose.yml`** - -```yaml -services: - backend: - build: - context: . - dockerfile: Dockerfile.backend - env_file: .env - volumes: - # Persist PubMed cache (24h TTL) and run logs across container restarts - - ebm5a_cache:/app/data/cache - - ebm5a_logs:/app/logs - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/api/health"] - interval: 15s - timeout: 5s - retries: 5 - start_period: 40s - - frontend: - build: - context: . - dockerfile: Dockerfile.frontend - ports: - - "80:80" - depends_on: - backend: - condition: service_healthy - -volumes: - ebm5a_cache: - ebm5a_logs: -``` - -Note: The backend is not exposed on the host — all traffic goes through nginx on port 80. Frontend calls `/api/*` which nginx proxies to `backend:8000` on the internal Docker network. 
- -- [ ] **Step 2: Create `docker-compose.dev.yml`** - -This file is used as `docker compose -f docker-compose.yml -f docker-compose.dev.yml up` to override the backend for hot-reload development: - -```yaml -services: - backend: - volumes: - # Mount source code for hot reload (overrides image-baked code) - - ./src:/app/src - - ./web:/app/web - command: uvicorn web.backend.app:app --host 0.0.0.0 --port 8000 --reload -``` - -- [ ] **Step 3: Create `.dockerignore`** - -``` -# Version control -.git/ -.gitignore - -# Secrets — never include in image -.env - -# Python artifacts -__pycache__/ -*.pyc -*.pyo -*.pyd -.pytest_cache/ -.coverage -htmlcov/ - -# Virtual envs -.venv/ -venv/ - -# Frontend build artifacts and deps -web/frontend/node_modules/ -web/frontend/dist/ - -# Data and logs (mounted as volumes at runtime) -data/ -logs/ - -# Dev tooling -.claude/ -docs/ -*.md -``` - -- [ ] **Step 4: Validate compose config** - -```bash -docker compose config -``` - -Expected: YAML printed with no errors. Should show both `backend` and `frontend` services, two named volumes (`ebm5a_cache`, `ebm5a_logs`), and the healthcheck on backend. - -- [ ] **Step 5: End-to-end smoke test (requires a valid `.env`)** - -If you have a valid `.env` with real API keys: - -```bash -docker compose up --build -d -# Wait ~60s for backend to pass health check -docker compose ps -``` - -Expected: both services `running` or `healthy`. Then open `http://localhost` — the EBM 5A web UI should load. 
- -```bash -docker compose down -``` - -- [ ] **Step 6: Commit** - -```bash -git add docker-compose.yml docker-compose.dev.yml .dockerignore -git commit -m "feat: add docker-compose.yml, dev override, and .dockerignore" -``` - ---- - -## Task 5: Create `Makefile` - -**Files:** -- Create: `Makefile` - -- [ ] **Step 1: Create `Makefile`** - -```makefile -.PHONY: help dev dev-backend dev-frontend docker-up docker-down docker-logs \ - test lint format check-env cli - -.DEFAULT_GOAL := help - -help: ## Show this help - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) \ - | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-18s\033[0m %s\n", $$1, $$2}' - -# ── Development ───────────────────────────────────────────────────────────── - -dev-backend: ## Start FastAPI backend with hot reload (port 8000) - uvicorn web.backend.app:app --reload --port 8000 - -dev-frontend: ## Start Vite dev server (port 5173) - cd web/frontend && npm run dev - -dev: ## Start backend + frontend together (Ctrl+C stops both) - @trap 'kill 0' SIGINT; \ - uvicorn web.backend.app:app --reload --port 8000 & \ - (cd web/frontend && npm run dev) & \ - wait - -# ── Docker ─────────────────────────────────────────────────────────────────── - -docker-up: ## Build and start all services in the background - docker compose up --build -d - -docker-down: ## Stop all Docker services - docker compose down - -docker-logs: ## Tail logs from all Docker services - docker compose logs -f - -# ── Quality ────────────────────────────────────────────────────────────────── - -test: ## Run test suite with pytest - pytest tests/ --tb=short -q - -lint: ## Check code style (ruff) - ruff check src/ web/backend/ - -format: ## Auto-format code (ruff) - ruff format src/ web/backend/ - -# ── Utilities ──────────────────────────────────────────────────────────────── - -check-env: ## Validate .env before running (run this first!) 
- python scripts/check_env.py - -cli: ## Run a clinical query via CLI (usage: make cli QUERY="your question") - python -m src.main "$(QUERY)" -``` - -- [ ] **Step 2: Verify help output** - -```bash -make help -``` - -Expected output (colours in terminal): -``` - dev-backend Start FastAPI backend with hot reload (port 8000) - dev-frontend Start Vite dev server (port 5173) - dev Start backend + frontend together (Ctrl+C stops both) - docker-up Build and start all services in the background - docker-down Stop all Docker services - docker-logs Tail logs from all Docker services - test Run test suite with pytest - lint Check code style (ruff) - format Auto-format code (ruff) - check-env Validate .env before running (run this first!) - cli Run a clinical query via CLI (usage: make cli QUERY="your question") -``` - -- [ ] **Step 3: Commit** - -```bash -git add Makefile -git commit -m "feat: add Makefile with dev, docker, lint, test, and check-env targets" -``` - ---- - -## Task 6: Create `scripts/check_env.py` - -**Files:** -- Create: `scripts/check_env.py` - -- [ ] **Step 1: Create `scripts/check_env.py`** - -```python -#!/usr/bin/env python3 -"""Validate .env configuration before running EBM 5A. - -Usage: - python scripts/check_env.py - make check-env - -Exit code 0: all required checks passed. -Exit code 1: one or more required checks failed. -""" - -import importlib.util -import os -import re -import sys -import urllib.request -from pathlib import Path - -OK = "[✓]" -FAIL = "[✗]" -WARN = "[~]" - -_errors = 0 - - -def ok(msg: str) -> None: - print(f"{OK} {msg}") - - -def fail(msg: str, hint: str) -> None: - global _errors - _errors += 1 - print(f"{FAIL} {msg}") - print(f" → {hint}") - - -def warn(msg: str) -> None: - print(f"{WARN} {msg}") - - -# ── 1. 
Load .env ───────────────────────────────────────────────────────────── - -env_path = Path(".env") -if not env_path.exists(): - fail( - ".env file not found", - "Run: cp .env.example .env then fill in your values", - ) - sys.exit(1) - -ok(".env file found") - -env_vars: dict[str, str] = {} -for line in env_path.read_text(encoding="utf-8").splitlines(): - line = line.strip() - if line and not line.startswith("#") and "=" in line: - key, _, value = line.partition("=") - env_vars[key.strip()] = value.strip().strip('"').strip("'") - -os.environ.update(env_vars) - -# ── 2. LLM_API_KEY ─────────────────────────────────────────────────────────── - -api_key = os.getenv("LLM_API_KEY", "") -if not api_key or api_key in ("your_api_key_here", ""): - fail( - "LLM_API_KEY not set or still placeholder", - "Add LLM_API_KEY= to .env", - ) -else: - ok("LLM_API_KEY is set") - -# ── 3. LLM_BASE_URL ────────────────────────────────────────────────────────── - -base_url = os.getenv("LLM_BASE_URL", "") -if not base_url: - fail( - "LLM_BASE_URL not set", - "Add LLM_BASE_URL=https://api.openai.com/v1 to .env", - ) -else: - try: - req = urllib.request.Request(base_url, method="HEAD") - urllib.request.urlopen(req, timeout=5) - ok(f"LLM_BASE_URL reachable ({base_url})") - except Exception as e: - # Many providers return 4xx on HEAD /v1 — that still means the host is up - code = getattr(e, "code", None) - if code is not None and code < 500: - ok(f"LLM_BASE_URL reachable — HTTP {code} (normal for this endpoint)") - else: - fail( - f"LLM_BASE_URL not reachable: {e}", - "Check LLM_BASE_URL in .env — is the server running / accessible?", - ) - -# ── 4. 
PUBMED_EMAIL ────────────────────────────────────────────────────────── - -email = os.getenv("PUBMED_EMAIL", "") -if not email or not re.match(r"[^@\s]+@[^@\s]+\.[^@\s]+", email): - fail( - "PUBMED_EMAIL not set or invalid format", - "Add PUBMED_EMAIL=your@email.com to .env (required by NCBI API)", - ) -else: - ok("PUBMED_EMAIL format valid") - -# ── 5. Python version ──────────────────────────────────────────────────────── - -vi = sys.version_info -if vi < (3, 10): - fail( - f"Python {vi.major}.{vi.minor} — need 3.10+", - "Upgrade Python: https://www.python.org/downloads/", - ) -else: - ok(f"Python {vi.major}.{vi.minor}.{vi.micro} >= 3.10") - -# ── 6. Core dependencies ───────────────────────────────────────────────────── - -required_pkgs = { - "langchain": "langchain", - "torch": "torch", - "fastapi": "fastapi", - "uvicorn": "uvicorn", -} - -missing = [name for name, pkg in required_pkgs.items() - if importlib.util.find_spec(pkg) is None] - -if missing: - fail( - f"Missing packages: {', '.join(missing)}", - "Run: pip install -r requirements.txt -r requirements-web.txt", - ) -else: - ok("Core dependencies installed (langchain, torch, fastapi, uvicorn)") - -# ── 7. Optional: FAST_LLM_MODEL ────────────────────────────────────────────── - -if not os.getenv("FAST_LLM_MODEL"): - warn( - "FAST_LLM_MODEL not set (optional) — " - "Judge/Scheduling will use LLM_MODEL; set a faster model for ~30% speedup" - ) -else: - ok(f"FAST_LLM_MODEL = {os.getenv('FAST_LLM_MODEL')}") - -# ── Summary ─────────────────────────────────────────────────────────────────── - -print() -if _errors: - print(f"❌ {_errors} required check(s) failed — fix the above before running.") - sys.exit(1) -else: - print("✅ All required checks passed. 
Ready to run.") -``` - -- [ ] **Step 2: Run against a valid `.env` — expect all green** - -```bash -python scripts/check_env.py -``` - -Expected (with a valid `.env`): -``` -[✓] .env file found -[✓] LLM_API_KEY is set -[✓] LLM_BASE_URL reachable (https://api.openai.com/v1) -[✓] PUBMED_EMAIL format valid -[✓] Python 3.11.x >= 3.10 -[✓] Core dependencies installed (langchain, torch, fastapi, uvicorn) -[~] FAST_LLM_MODEL not set (optional) — ... - -✅ All required checks passed. Ready to run. -``` - -- [ ] **Step 3: Test failure path — rename `.env` temporarily** - -```bash -mv .env .env.bak -python scripts/check_env.py -mv .env.bak .env -``` - -Expected: script prints `[✗] .env file not found` and exits with code 1: -```bash -echo $? # should print: 1 -``` - -- [ ] **Step 4: Commit** - -```bash -git add scripts/check_env.py -git commit -m "feat: add scripts/check_env.py — validates .env before first run" -``` - ---- - -## Task 7: Create `.github/workflows/ci.yml` - -**Files:** -- Create: `.github/workflows/ci.yml` - -- [ ] **Step 1: Create the directory and workflow file** - -```bash -mkdir -p .github/workflows -``` - -Create `.github/workflows/ci.yml`: - -```yaml -name: CI - -on: - push: - branches: [main] - pull_request: - branches: [main] - -jobs: - # ── Job 1: Lint ───────────────────────────────────────────────────────────── - lint: - name: Lint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Install ruff - run: pip install ruff - - - name: Check style (ruff) - run: ruff check src/ web/backend/ - - - name: Check formatting (ruff) - run: ruff format --check src/ web/backend/ - - # ── Job 2: Test ───────────────────────────────────────────────────────────── - test: - name: Test - runs-on: ubuntu-latest - env: - # Placeholder values — CI never calls real APIs. - # POLICY: do NOT replace these with real keys in YAML or GitHub Secrets. 
- # Real API keys would cause live (costly) calls on every PR. - LLM_API_KEY: ci-placeholder-not-real - LLM_BASE_URL: https://api.openai.com/v1 - LLM_MODEL: gpt-4 - PUBMED_EMAIL: ci@example.com - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - cache: pip - cache-dependency-path: | - requirements.txt - requirements-web.txt - - - name: Install PyTorch (CPU-only wheel — faster than default CUDA wheel) - run: pip install torch --index-url https://download.pytorch.org/whl/cpu - - - name: Install remaining dependencies - run: pip install -r requirements.txt -r requirements-web.txt - - - name: Run tests - # Exit code 5 = "no tests collected" — treat as pass until tests are added. - # Real failures (exit code 1) still fail CI. - run: | - pytest tests/ --tb=short -q - STATUS=$? - [ $STATUS -eq 5 ] && exit 0 || exit $STATUS - shell: bash - - # ── Job 3: Docker Build ───────────────────────────────────────────────────── - docker-build: - name: Docker Build - runs-on: ubuntu-latest - needs: [lint, test] - steps: - - uses: actions/checkout@v4 - - - uses: docker/setup-buildx-action@v3 - - - name: Build backend image - uses: docker/build-push-action@v5 - with: - context: . - file: Dockerfile.backend - push: false - cache-from: type=gha - cache-to: type=gha,mode=max - - - name: Build frontend image - uses: docker/build-push-action@v5 - with: - context: . - file: Dockerfile.frontend - push: false - cache-from: type=gha - cache-to: type=gha,mode=max -``` - -- [ ] **Step 2: Validate YAML syntax locally** - -```bash -python -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml')); print('YAML valid')" -``` - -Expected: `YAML valid` - -- [ ] **Step 3: Commit** - -```bash -git add .github/workflows/ci.yml -git commit -m "feat: add GitHub Actions CI (lint + test + docker-build with caching)" -``` - -After pushing to GitHub, the Actions tab will show the workflow running. 
First run of the `test` job will be slow (~10–15 min) due to PyTorch download; subsequent runs use the pip cache (~2–3 min). - ---- - -## Task 8: Create `docs/troubleshooting.md` - -**Files:** -- Create: `docs/troubleshooting.md` - -- [ ] **Step 1: Create `docs/troubleshooting.md`** - -```markdown -# Troubleshooting - -Common issues and how to fix them. - ---- - -## Setup Errors - -### `.env` file not found - -**Symptom:** `FileNotFoundError: .env not found` or `make check-env` reports missing file. - -**Fix:** -```bash -cp .env.example .env -# Then edit .env and fill in your LLM_API_KEY and PUBMED_EMAIL -``` - ---- - -### `LLM_API_KEY` invalid or quota exceeded - -**Symptom:** `AuthenticationError`, `401 Unauthorized`, or `429 Too Many Requests` in logs. - -**Fix:** -- Verify the key in `.env` matches your provider's format. -- Check your API quota / billing dashboard. -- If using a custom `LLM_BASE_URL`, ensure the base URL does not include a trailing `/chat/completions` — it should end at `/v1`. - ---- - -### `LLM_BASE_URL` unreachable - -**Symptom:** `ConnectionError` or `make check-env` reports `[✗] LLM_BASE_URL not reachable`. - -**Fix:** -- Check the URL is reachable from your machine: `curl -I https://your-provider/v1` -- If behind a proxy, ensure `HTTPS_PROXY` is set in your environment. -- If using a local LLM server (e.g., Ollama), ensure it is running. - ---- - -## PubMed Issues - -### Rate limiting (`HTTP 429` from PubMed) - -**Symptom:** `429` errors in logs during the Acquire stage. - -**Cause:** NCBI limits unauthenticated requests to 3/second. The client respects this by default, but network latency variations can occasionally trigger it. - -**Fix:** This is usually transient — the next run will succeed. If persistent, register for an [NCBI API key](https://www.ncbi.nlm.nih.gov/account/) (allows 10 req/s). - ---- - -### PubMed returns no results - -**Symptom:** Acquire stage completes with 0 articles; Apply stage receives no evidence. 
- -**Causes:** -- The clinical question uses highly specific terminology not present in PubMed MeSH terms. Try rephrasing. -- `PUBMED_EMAIL` is unset or invalid — NCBI may silently throttle requests without a valid email. - ---- - -## Runtime Behaviour - -### A run takes 5–10 minutes — is it stuck? - -**No, this is normal.** Each stage involves one or more LLM calls: -- Ask: ~10s -- Acquire: ~30–60s (PubMed fetch + MedCPT re-ranking) -- Appraise: ~60–120s (parallel LLM calls for up to 10 articles) -- Apply: ~30–90s (may retry if Judge score < 0.7) -- Assess: ~20s - -Total: 2–10 minutes depending on model speed and evidence complexity. - -The CLI prints `[TIMING]` lines at each stage. The Web UI shows live progress. - ---- - -### Backtrack events in logs — is something wrong? - -**No, backtracks are by design.** When a stage scores below the Judge threshold (0.7/1.0), the Scheduling LLM may decide to retry the stage or backtrack to a previous stage. This is the quality-gating mechanism working correctly. - -If a run produces more than 3–4 backtracks and never completes, the question may be outside the system's evidence coverage — it will eventually return `Insufficient Evidence`. - ---- - -### `[FAST-PATH]` in logs — what does this mean? - -The coordinator detected that the current stage can be skipped: -- `FAST-PATH`: `pass_threshold=True` and no critical/major issues → proceed without calling the Scheduling LLM. -- `FAST-PATH-2`: The current set of major issues has been seen before (loop detected) → auto-proceed to prevent infinite loops. - -Both are expected behaviour. - ---- - -## Web UI Issues - -### Frontend loads but API calls fail (network error) - -**Symptom:** Web UI shows "Failed to start session" immediately after submitting a question. - -**Cause (manual dev mode):** The frontend dev server (port 5173) calls the backend (port 8000) cross-origin. The backend allows `*` CORS, but the browser may block it in some configurations. - -**Fix options:** -1. 
Use Docker mode (`make docker-up`) — nginx handles the proxy on the same origin. -2. Ensure the backend is actually running: `make dev-backend` in a separate terminal. - ---- - -### Blank page at `http://localhost` (Docker mode) - -**Cause:** Frontend container started before backend passed its health check. - -**Fix:** -```bash -docker compose down -docker compose up --build -d -# Wait 30–60 seconds, then refresh -docker compose ps # both services should show "healthy" or "running" -``` - ---- - -### SSE stream stops mid-workflow in Docker - -**Symptom:** Progress updates stop after a few events; the browser shows the connection closed. - -**Cause:** Nginx has a default `proxy_read_timeout` of 60s, which may expire for long workflows. - -Our `nginx.conf` sets `proxy_read_timeout 600s` (10 min) which should be sufficient. If you modified `nginx.conf`, ensure this setting is present. - ---- - -## Log Interpretation - -| Log pattern | Meaning | -|-------------|---------| -| `[TIMING] Acquire: 45.2s` | Stage took 45.2 seconds | -| `[FAST-PATH] proceed` | Skipped Scheduling LLM — stage passed cleanly | -| `[FAST-PATH-2] loop detected, auto-proceed` | Repeated major-issue pattern — forced proceed | -| `Judge score: 0.82 / threshold: 0.70` | Stage passed quality gate | -| `Judge score: 0.61 / threshold: 0.70` | Stage failed — Scheduling LLM will decide next action | -| `Backtrack to Acquire` | System re-running Acquire with a revised query | -| `Insufficient Evidence` | Final result — no recommendation was forced | -``` - -- [ ] **Step 2: Commit** - -```bash -git add docs/troubleshooting.md -git commit -m "docs: add troubleshooting guide (setup, PubMed, runtime, Web UI, logs)" -``` - ---- - -## Task 9: Create `docs/glossary.md` - -**Files:** -- Create: `docs/glossary.md` - -- [ ] **Step 1: Create `docs/glossary.md`** - -```markdown -# Glossary - -Key terms used in EBM 5A and the Evidence-Based Medicine framework. 
- ---- - -## 5A Framework - -The international EBM workflow operationalised by this system: - -| Stage | Full name | What it does | -|-------|-----------|-------------| -| **Ask** | Ask a structured question | Converts a free-text clinical question into a structured PICO format and identifies the question type | -| **Acquire** | Acquire the evidence | Searches PubMed with appropriate filters, re-ranks results with MedCPT | -| **Appraise** | Appraise the evidence | Rates each article's study type and assigns a GRADE evidence level | -| **Apply** | Apply to the patient | Synthesises the evidence into a recommendation with strength and quality ratings | -| **Assess** | Assess the outcome | Reviews the full workflow and produces a final structured summary | - ---- - -## PICO - -A framework for structuring clinical questions: - -- **P** — Patient / Population / Problem -- **I** — Intervention (treatment, test, exposure) -- **C** — Comparison (alternative intervention, placebo, or no treatment) -- **O** — Outcome (what you are trying to measure or achieve) - -Example: *"In [P: 68-year-old with NSTEMI and GI bleed], does [I: DAPT] compared to [C: clopidogrel monotherapy] reduce [O: recurrent MI] without increasing [O: GI bleeding]?"* - ---- - -## Question Types - -EBM 5A automatically identifies the question type during the Ask stage to apply the appropriate PubMed search filter: - -| Type | Description | Search filter used | -|------|-------------|-------------------| -| **Therapy** | Does treatment X work better than Y? | Cochrane Highly Sensitive Search Strategy (HSSS) — RCTs and SRs | -| **Diagnosis** | How accurate is test X for condition Y? | Diagnostic test accuracy studies | -| **Prognosis** | What is the likely outcome for a patient with X? | Observational studies (cohort) | -| **Harm** | Does exposure X cause harm Y? | Observational studies (cohort + case-control) | -| **Prevention** | Does intervention X prevent condition Y? 
| RCTs and observational studies | - ---- - -## GRADE Evidence Quality - -GRADE (Grading of Recommendations Assessment, Development, and Evaluation) is the international standard for rating evidence quality. In EBM 5A, GRADE levels are **computed by deterministic Python code** — the LLM classifies study types and design features; Python calculates the final grade. - -| Level | Meaning | Typical study types | -|-------|---------|-------------------| -| **High** | Very confident the effect estimate is close to the true effect | Systematic review / meta-analysis, well-designed RCT | -| **Moderate** | Moderately confident; true effect likely close to estimate, but may differ | RCT with limitations, well-designed observational | -| **Low** | Limited confidence; true effect may differ substantially | Observational study (cohort, case-control) | -| **Very Low** | Very little confidence in the effect estimate | Case series, expert opinion, narrative review | - -Factors that **downgrade** evidence: risk of bias, inconsistency, indirectness, imprecision, publication bias. -Factors that **upgrade** evidence: large effect size, dose-response gradient, all plausible confounders reduce effect. - ---- - -## Recommendation Strength - -The Apply agent assigns a recommendation strength based on evidence quality and clinical context: - -| Strength | Meaning | When used | -|----------|---------|-----------| -| **Strong** | Benefits clearly outweigh harms for most patients | High/Moderate GRADE evidence with consistent direction | -| **Conditional** | Benefits probably outweigh harms, but uncertainty exists | Lower GRADE evidence, indirect evidence, or significant patient variability | -| **Consensus-based** | No direct evidence; based on clinical guidelines or expert consensus | Diagnosis questions, topics covered by major guidelines (ESC, AHA, etc.) 
| -| **Insufficient Evidence** | Cannot make a recommendation — evidence is absent, conflicting, or too weak | No relevant studies retrieved or all studies critically flawed | - ---- - -## Judge Score - -Each stage's output is evaluated by the Judge LLM, which produces a score from 0.0 to 1.0. - -- **Threshold:** 0.70 — stages scoring below this threshold are flagged for retry or backtrack. -- **Composition:** The Judge classifies individual quality dimensions as `pass` / `minor` / `major` / `critical`. Python code converts these labels to a numerical score. -- **Purpose:** Prevents low-quality intermediate outputs from propagating to the final recommendation. - ---- - -## ReAct Loop - -**Re**asoning + **Act**ing — the control loop pattern used by EBM 5A's coordinator: - -1. Run stage → produce output -2. Judge scores the output -3. Scheduling LLM decides: proceed / retry / backtrack -4. Repeat until all stages pass or max iterations reached - -This loop ensures quality gates are enforced at every stage and allows the system to recover from poor intermediate outputs. - ---- - -## MedCPT - -A biomedical dense retrieval model (from NCBI) used to re-rank PubMed search results by relevance to the clinical question. Runs locally using PyTorch (CPU inference). Improves article relevance compared to keyword-only BM25 ranking. -``` - -- [ ] **Step 2: Commit** - -```bash -git add docs/glossary.md -git commit -m "docs: add glossary (5A, PICO, GRADE, recommendation strength, Judge score, ReAct)" -``` - ---- - -## Task 10: Create GitHub Issue and PR Templates - -**Files:** -- Create: `.github/ISSUE_TEMPLATE/bug_report.md` -- Create: `.github/ISSUE_TEMPLATE/feature_request.md` -- Create: `.github/pull_request_template.md` - -- [ ] **Step 1: Create `.github/ISSUE_TEMPLATE/bug_report.md`** - -```bash -mkdir -p .github/ISSUE_TEMPLATE -``` - -```markdown ---- -name: Bug Report -about: Something isn't working as expected -labels: bug ---- - -## Environment - -- **OS:** (e.g. 
Ubuntu 22.04, macOS 14, Windows 11) -- **Python version:** (e.g. 3.11.2) — run `python --version` -- **Interface:** CLI / Web UI / Both -- **LLM provider:** (e.g. OpenAI, Azure OpenAI, HuaTuoGPT, local Ollama) -- **EBM 5A version / commit:** (run `git log --oneline -1`) - -## Steps to Reproduce - -1. -2. -3. - -## Expected Behaviour - -What should have happened. - -## Actual Behaviour - -What actually happened. - -## Relevant Log Output - -Paste the relevant section from `logs/` or the terminal output. -Use a code block: - -``` -[paste log here] -``` - -## Additional Context - -Any other context (screenshots, related issues, config details). -``` - -- [ ] **Step 2: Create `.github/ISSUE_TEMPLATE/feature_request.md`** - -```markdown ---- -name: Feature Request -about: Suggest a new feature or improvement -labels: enhancement ---- - -## What would you like? - -A clear description of the feature you are requesting. - -## Why is this useful? - -Describe the use case or motivation. Who benefits, and how? - -## Possible implementation approach (optional) - -If you have ideas about how it could be implemented, share them here. -``` - -- [ ] **Step 3: Create `.github/pull_request_template.md`** - -```markdown -## Summary - - - -- -- - -## Related Issue - -Closes # - -## How to Test - - - -1. -2. 
- -## Checklist - -- [ ] `make lint` passes -- [ ] `make test` passes (or no tests affected) -- [ ] Docs updated if behaviour changed -- [ ] `.env.example` updated if new env vars added -``` - -- [ ] **Step 4: Commit** - -```bash -git add .github/ISSUE_TEMPLATE/ .github/pull_request_template.md -git commit -m "chore: add GitHub issue and PR templates" -``` - ---- - -## Task 11: Update `README.md` and `QUICKSTART.md` - -**Files:** -- Modify: `README.md` -- Modify: `QUICKSTART.md` - -- [ ] **Step 1: Add CI badge to `README.md`** - -Find the existing badges block (lines 8–12 in the current file): - -```markdown -[![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![LangChain](https://img.shields.io/badge/LangChain-0.1.0-green.svg)](https://python.langchain.com/) -[![OpenAI Compatible](https://img.shields.io/badge/API-OpenAI%20Compatible-412991.svg)](https://platform.openai.com/) -[![PubMed](https://img.shields.io/badge/data-PubMed%20Real--time-326599.svg)](https://pubmed.ncbi.nlm.nih.gov/) -``` - -Replace with (add CI badge as the first badge; replace `YOUR_GITHUB_USERNAME` with the actual GitHub username): - -```markdown -[![CI](https://github.com/YOUR_GITHUB_USERNAME/ebm5a/actions/workflows/ci.yml/badge.svg)](https://github.com/YOUR_GITHUB_USERNAME/ebm5a/actions/workflows/ci.yml) -[![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![LangChain](https://img.shields.io/badge/LangChain-0.1.0-green.svg)](https://python.langchain.com/) -[![OpenAI Compatible](https://img.shields.io/badge/API-OpenAI%20Compatible-412991.svg)](https://platform.openai.com/) 
-[![PubMed](https://img.shields.io/badge/data-PubMed%20Real--time-326599.svg)](https://pubmed.ncbi.nlm.nih.gov/) -``` - -Note: find the actual GitHub username with `git remote -v`. Replace `YOUR_GITHUB_USERNAME` and the repo name accordingly. - -- [ ] **Step 2: Add Docker Quick Start and Web UI sections to `README.md`** - -Find the English section that begins with `### What is EBM 5A?`. Insert the following **before** this heading (immediately after the `` anchor line): - -```markdown -### Quick Start - -**Docker (recommended — one command, no environment setup):** - -```bash -cp .env.example .env # fill in LLM_API_KEY, PUBMED_EMAIL -make docker-up # builds and starts backend + frontend -# Open http://localhost -``` - -**Manual (CLI only):** - -```bash -pip install -r requirements.txt -cp .env.example .env # fill in your values -make check-env # validate configuration -make cli QUERY="68-year-old with NSTEMI and acute GI bleed: DAPT or clopidogrel monotherapy?" -``` - -### Interfaces - -| Interface | How to start | URL | -|-----------|-------------|-----| -| **Web UI** (Docker) | `make docker-up` | http://localhost | -| **Web UI** (manual) | `make dev-backend` + `make dev-frontend` | http://localhost:5173 | -| **CLI** | `make cli QUERY="..."` | — | - -The Web UI provides real-time workflow visualisation, stage-by-stage scores, evidence tables, and history. The CLI outputs the full audit trail to `logs/`. - -See [docs/troubleshooting.md](docs/troubleshooting.md) for common issues and [docs/glossary.md](docs/glossary.md) for GRADE/PICO/recommendation strength definitions. - ---- -``` - -- [ ] **Step 3: Add Chinese equivalents to `README.md`** - -Find the Chinese section anchor `` (search for it). 
After the Chinese section header that corresponds to "What is EBM 5A?", insert the Chinese equivalent: - -```markdown -### 快速开始 - -**Docker(推荐——一行命令,无需配置环境):** - -```bash -cp .env.example .env # 填写 LLM_API_KEY 和 PUBMED_EMAIL -make docker-up # 构建并启动后端 + 前端 -# 访问 http://localhost -``` - -**手动(仅 CLI):** - -```bash -pip install -r requirements.txt -cp .env.example .env # 填写相关配置 -make check-env # 验证配置 -make cli QUERY="68岁男性,NSTEMI合并急性消化道出血:DAPT还是单用氯吡格雷?" -``` - -### 界面 - -| 界面 | 启动方式 | 访问地址 | -|------|---------|--------| -| **Web UI**(Docker) | `make docker-up` | http://localhost | -| **Web UI**(手动) | `make dev-backend` + `make dev-frontend` | http://localhost:5173 | -| **CLI** | `make cli QUERY="..."` | — | - -Web UI 提供实时工作流可视化、逐阶段评分、证据表格和历史记录。CLI 将完整审计日志输出到 `logs/`。 - -常见问题请参阅 [docs/troubleshooting.md](docs/troubleshooting.md);GRADE / PICO / 推荐强度等术语请参阅 [docs/glossary.md](docs/glossary.md)。 - ---- -``` - -- [ ] **Step 4: Update `QUICKSTART.md` — add Docker as the first option** - -The current `QUICKSTART.md` starts with `## 1. 配置环境`. Insert a new section **before** this, at the very top after the `# EBM 5A系统快速开始指南` title: - -```markdown -## 方式一:Docker(推荐) - -最快的上手方式——一行命令启动前后端,无需手动配置 Python 环境。 - -**前提:** 安装 [Docker Desktop](https://www.docker.com/products/docker-desktop/) 或 Docker Engine + Docker Compose。 - -```bash -# 1. 复制并填写配置文件 -cp .env.example .env -# 用编辑器打开 .env,填写 LLM_API_KEY 和 PUBMED_EMAIL - -# 2. 启动 -make docker-up - -# 3. 访问 Web UI -# 浏览器打开 http://localhost -``` - -停止服务: -```bash -make docker-down -``` - -查看日志: -```bash -make docker-logs -``` - ---- - -## 方式二:手动安装 -``` - -Then leave the existing `## 1. 配置环境` heading and everything below it unchanged, directly after the new `## 方式二:手动安装` line, so that the existing steps read as part of "方式二" (no change to content, just continuity). 
- -- [ ] **Step 5: Commit** - -```bash -git add README.md QUICKSTART.md -git commit -m "docs: add Docker quick start, Web UI section, CI badge to README and QUICKSTART" -``` - ---- - -## Self-Review Checklist - -After completing all tasks, verify: - -- [ ] `make help` shows all expected commands -- [ ] `make check-env` runs without crash (exit 0 with valid .env, exit 1 without) -- [ ] `docker compose config` validates without error -- [ ] `docker compose build` succeeds for both images -- [ ] `python -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))"` is valid -- [ ] `docs/troubleshooting.md` and `docs/glossary.md` exist and render correctly -- [ ] `.github/ISSUE_TEMPLATE/` has two files; `.github/pull_request_template.md` exists -- [ ] README shows CI badge, Docker Quick Start, and Web UI table -- [ ] `QUICKSTART.md` has Docker section as the first option diff --git a/docs/superpowers/plans/2026-04-20-22-full-redesign.md b/docs/superpowers/plans/2026-04-20-22-full-redesign.md deleted file mode 100644 index 64a79d8..0000000 --- a/docs/superpowers/plans/2026-04-20-22-full-redesign.md +++ /dev/null @@ -1,321 +0,0 @@ -# 4/20-4/22 全量重设计实现计划 - -> **For agentic workers:** Use superpowers:subagent-driven-development to implement task-by-task. 
- -**Goal:** Ask 路由重设计、Acquire PMC+RAG、Appraise GRADE 修正、Apply 对齐、Judge Gate+Rubrics 架构重写。 - -**原则:** 每个 Task 先写失败测试(如适用),再改代码,再验证,不需要commit。 - ---- - -## 文件改动清单 - -| 文件 | 操作 | -|---|---| -| src/state/schema.py | 新增 EBMQuery;WorkflowState 路由字段;Evidence.has_full_text | -| src/config/prompts/ask/ | 新建目录 + 8 个 prompt 文件 | -| src/agents/ask_agent.py | 重写:路由→分支→统一输出 | -| src/coordinator/coordinator.py | direct_answer 提前终止 | -| src/tools/pubmed_api.py | 新增 fetch_pmc_full_text | -| src/agents/acquire_agent.py | EBMQuery 适配;PMC 全文;BM25+Embedding RAG | -| src/agents/appraise_agent.py | _compute_grade 重写;SR 动态初始分;升级阻断 | -| src/config/prompts/appraise_agent.txt | 新增 included_study_type、confounding_bias_mitigates | -| src/agents/apply_agent.py | route_type 注入;结构化 GRADE;inconsistency 强制规则 | -| src/config/prompts/apply_agent.txt | 路由维度检查;结构化 GRADE 输入变量 | -| src/config/prompts/judge/ask_judge.txt | 重写:Gate+Rubrics | -| src/config/prompts/judge/acquire_judge.txt | 重写:Gate+Rubrics | -| src/config/prompts/judge/appraise_judge.txt | 重写:Gate+Rubrics | -| src/config/prompts/judge/apply_judge.txt | 重写:Gate+Rubrics | -| src/config/prompts/judge/assess_judge.txt | 新增路由字段输入;route_confidence_noted | -| src/judge/judge_llm.py | Gate 检查;Rubric 评分;RUBRIC_WEIGHTS | -| tests/test_judge_rubrics.py | 新建:Gate+Rubrics 单元测试 | -| tests/test_integration_routing.py | 新建:路由集成测试 | - ---- - -## Task 1: schema.py — EBMQuery + routing fields + Evidence.has_full_text - -**Files:** `src/state/schema.py` - -- [ ] 在 PICOQuery 之后插入 EBMQuery dataclass(字段:query_type, patient, primary_focus, outcome, keywords, comparator=None, reference_standard=None, time_horizon=None) -- [ ] Evidence 新增 `has_full_text: bool = False` -- [ ] WorkflowState 新增:route_type, route_confidence, direct_answer_output, ebm_query, sub_pico_queries, sub_question_index, sub_question_total -- [ ] 验证:`python3 -c "from src.state.schema import EBMQuery, WorkflowState, Evidence; print('OK')"` -- [ ] `git commit -m "feat(schema): EBMQuery, routing 
fields, Evidence.has_full_text"` - ---- - -## Task 2: Ask prompt files — create src/config/prompts/ask/ with 8 files - -**Files:** `src/config/prompts/ask/`(新建目录) - -| 文件 | 输入变量 | 输出 JSON 关键字段 | -|---|---|---| -| router.txt | {question} | route_type, reasoning | -| direct_answer.txt | {question} | answer, source, disclaimer | -| diag_step1.txt | {question} | clinical_features[], differential_diagnoses[](最多3个) | -| diag_step2.txt | {clinical_features}, {single_diagnosis} | EBMQuery 字段 | -| ebm_pico.txt | {question}, {backtrack_context} | EBMQuery(query_type=pico) | -| ebm_pird.txt | {question}, {backtrack_context} | EBMQuery(query_type=pird) | -| ebm_peo.txt | {question}, {backtrack_context} | EBMQuery(query_type=peo) | -| ebm_prognosis.txt | {question}, {backtrack_context} | EBMQuery(query_type=prognosis) | - -router.txt 的 direct_answer 触发条件须同时满足:(1) 要求立即操作性指导;(2) 延迟会危及生命;(3) 答案来自公认标准流程(BLS/ACLS)。 - -- [ ] 创建目录并写入 8 个文件,每个文件含 Role + 输入变量 + 输出 JSON 格式,不含示例数据 -- [ ] 验证所有文件存在且非空 -- [ ] `git commit -m "feat(ask-prompts): 8 routing prompt files"` - ---- - -## Task 3: ask_agent.py — rewrite with routing logic - -**Files:** `src/agents/ask_agent.py` - -``` -__init__: 从 src/config/prompts/ask/ 加载 8 个 prompt 到 self._prompts dict -execute(state): - route = _call("router", question) - if route == "direct_answer": return {direct_answer_output, _ask_direct_answer: True} - if route == "diagnostic_reasoning": - step1 = _call("diag_step1"); sub_queries = [_call("diag_step2", ...) 
for diag in differentials] - return {sub_pico_queries, ebm_query: sub_queries[0]} - ebm_dict = _call(route_type, question, backtrack_context) - return {ebm_query: EBMQuery(**ebm_dict), pico_query: PICOQuery(...), route_type} -``` - -- [ ] 重写 ask_agent.py -- [ ] 验证:`python3 -c "from src.agents.ask_agent import AskAgent; print('OK')"` -- [ ] `git commit -m "feat(ask-agent): routing with direct_answer/diag/ebm_* branches"` - ---- - -## Task 4: coordinator.py — direct_answer early termination - -**Files:** `src/coordinator/coordinator.py` - -```python -if agent_name == "Ask" and result.get("_ask_direct_answer"): - state.update(result); state["should_terminate"] = True; return state -``` - -- [ ] 在 execute_agent 的 result = agent.execute(state) 之后插入上述代码 -- [ ] 验证:`python3 -c "from src.coordinator.coordinator import Coordinator; print('OK')"` -- [ ] `git commit -m "feat(coordinator): early termination for direct_answer route"` - ---- - -## Task 5: pubmed_api.py — add fetch_pmc_full_text via PMC OA BioC JSON API - -**Files:** `src/tools/pubmed_api.py` - -``` -URL: https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmcid}/unicode -timeout=10,任何异常返回 None -解析 documents[0].passages[*].text,拼接返回 -``` - -- [ ] 在文件末尾追加 fetch_pmc_full_text 函数 -- [ ] 验证:`python3 -c "from src.tools.pubmed_api import fetch_pmc_full_text; print('OK')"` -- [ ] `git commit -m "feat(pubmed-api): add fetch_pmc_full_text via PMC OA BioC JSON"` - ---- - -## Task 6: acquire_agent.py — EBMQuery support + PMC full text + BM25/Embedding RAG - -**Files:** `src/agents/acquire_agent.py` - -- [ ] 新增 _FILTER_BY_ROUTE_TYPE(ebm_pico→HSSS, ebm_pird→DTA, ebm_peo/prognosis→OBSERVATIONAL) -- [ ] 新增线程安全 _get_embedding_model()(懒加载 all-MiniLM-L6-v2) -- [ ] 新增 _fetch_full_texts(candidates):ThreadPoolExecutor(max_workers=8),写入 evidence.full_text 和 evidence.has_full_text -- [ ] 新增 _rag_extract(evidence, query_terms):BM25 top-8 → Embedding rerank top-3,返回 (key_sentences, score) -- [ ] 更新 execute:读 
ebm_query(优先)或 pico_query(兼容);fetch full texts;RAG extract;full_text 文章排前 -- [ ] 验证:`python3 -c "from src.agents.acquire_agent import AcquireAgent; print('OK')"` -- [ ] `git commit -m "feat(acquire): EBMQuery routing, PMC full-text, BM25+Embedding RAG"` - ---- - -## Task 7: appraise_agent.py — fix _compute_grade: SR dynamic initial score, upgrade blocked by SERIOUS bias, cap observational at Moderate - -**Files:** `src/agents/appraise_agent.py`, `tests/test_appraise_grade.py` - -测试用例(5个):SR+RCT→High;SR+OBSERVATIONAL→Low;COHORT+SERIOUS+全升级→Very Low(阻断);COHORT+NOT_SERIOUS+全升级→Moderate(cap);CROSS_SECTIONAL+全升级→Low(不在升级类型中) - -``` -_SR_INITIAL_POINTS = {"RCT": 4, "OBSERVATIONAL": 2, "MIXED": 3, "UNKNOWN": 3} -_SR_TYPES = {"SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA"} -_UPGRADE_STUDY_TYPES = {"COHORT", "CASE_CONTROL"} -``` - -- [ ] 先写测试,确认 FAIL -- [ ] 重写 _compute_grade:SR 查 _SR_INITIAL_POINTS;升级仅对 _UPGRADE_STUDY_TYPES 且 NOT_SERIOUS 时生效;升级后 cap at min(points, 3) -- [ ] 确认测试 PASS -- [ ] `git commit -m "feat(appraise): dynamic SR initial score, upgrade blocked by bias, cap at Moderate"` - ---- - -## Task 8: appraise_agent.txt — add included_study_type, confounding_bias_mitigates fields - -**Files:** `src/config/prompts/appraise_agent.txt` - -- [ ] 在"研究类型"节末尾新增 included_study_type 说明(仅 SR/MA/NMA 填写;取值 RCT/OBSERVATIONAL/MIXED/UNKNOWN) -- [ ] 在"升级因素"节新增 confounding_bias_mitigates(YES/NO/NA)和 upgrade_blocked_by_bias(true/false);注明 SERIOUS 偏倚时升级被阻断 -- [ ] 在 JSON 输出模板中新增这两个字段 -- [ ] 验证:`python3 -c "t=open('src/config/prompts/appraise_agent.txt').read(); assert 'included_study_type' in t; print('OK')"` -- [ ] `git commit -m "feat(prompt/appraise): included_study_type, confounding_bias_mitigates"` - ---- - -## Task 9: apply_agent.py — route_type injection, structured GRADE, inconsistency enforcement - -**Files:** `src/agents/apply_agent.py`, `tests/test_apply_agent.py` - -测试用例(4个):_format_ebm_query pico 含 "Intervention:";pird 含 "Index Test:";_summarize_downgrade_factors 全 
NOT_SERIOUS→固定字符串;有 SERIOUS→含因素名 - -- [ ] 新增模块级函数:_format_ebm_query, _format_pico_query, _summarize_downgrade_factors -- [ ] 更新 execute:注入 route_type/query_description/key_downgrade_factors/has_serious_inconsistency -- [ ] 强制规则:overall_grade in (Very Low, Low) + LLM 给 Strong → 降为 Weak;has_serious_inconsistency + Strong → 降为 Weak -- [ ] route_confidence == "low" 时在 caveats 追加警告 -- [ ] 先写测试确认 FAIL,修改代码确认 PASS -- [ ] `git commit -m "feat(apply): route_type injection, structured GRADE, inconsistency enforcement"` - ---- - -## Task 10: apply_agent.txt — add route_type dimension check, structured GRADE input variables - -**Files:** `src/config/prompts/apply_agent.txt` - -- [ ] 新增输入变量:{route_type}, {query_description}, {overall_grade}, {downgrade_factors}, {consistency_flag} -- [ ] 在 prompt 开头新增 Step 0:根据 {route_type} 说明当前问题框架(治疗/诊断/病因/预后) -- [ ] 在推荐强度规则中明确写入:SERIOUS inconsistency → 不得给 Strong -- [ ] 验证:`python3 -c "t=open('src/config/prompts/apply_agent.txt').read(); assert '{route_type}' in t; print('OK')"` -- [ ] `git commit -m "feat(prompt/apply): route_type dimension check, structured GRADE input"` - ---- - -## Task 11: ask_judge.txt — rewrite Gate+Rubrics - -**Files:** `src/config/prompts/judge/ask_judge.txt` - -Gate(任一触发 → 直接 FAIL):intent_distorted == YES;keywords_english_medical == NO - -| 维度 | 权重 | -|---|---| -| pico_completeness(P/I/O 均 YES) | 0.30 | -| keyword_quality(MeSH + 同义词) | 0.25 | -| route_correctness(route_type 与问题匹配) | 0.25 | -| clarity(表述清晰度) | 0.20 | - -输出 JSON 新增:gate_passed: bool, rubric_scores: {...}, weighted_score: float - -- [ ] 重写文件 -- [ ] `git commit -m "feat(judge/ask): Gate+Rubrics architecture"` - ---- - -## Task 12: acquire_judge.txt — rewrite Gate+Rubrics - -**Files:** `src/config/prompts/judge/acquire_judge.txt` - -Gate:search_terms_valid == NO - -| 维度 | 权重 | -|---|---| -| evidence_quality(best_study_type) | 0.35 | -| pico_match | 0.35 | -| selection_quality(listwise 合理性) | 0.30 | - -- [ ] 重写文件 -- [ ] `git commit -m 
"feat(judge/acquire): Gate+Rubrics architecture"` - ---- - -## Task 13: appraise_judge.txt — rewrite Gate+Rubrics - -**Files:** `src/config/prompts/judge/appraise_judge.txt` - -Gate:study_type_correct == NO - -| 维度 | 权重 | -|---|---| -| downgrade_factors(分类合理性) | 0.35 | -| computed_grade(合理性) | 0.35 | -| upgrade_factors(含 confounding_bias_mitigates 审计) | 0.30 | - -- [ ] 重写文件 -- [ ] `git commit -m "feat(judge/appraise): Gate+Rubrics architecture"` - ---- - -## Task 14: apply_judge.txt — rewrite Gate+Rubrics - -**Files:** `src/config/prompts/judge/apply_judge.txt` - -Gate:recommendation_based_on_evidence == NO - -| 维度 | 权重 | -|---|---| -| grounding(推荐-证据匹配) | 0.35 | -| strength_match(推荐强度 vs GRADE) | 0.35 | -| route_dimension(route_dimension_correct) | 0.15 | -| actionability(临床可操作性) | 0.15 | - -新增输入变量 {route_type},用于判断推荐是否符合当前问题框架。 - -- [ ] 重写文件 -- [ ] `git commit -m "feat(judge/apply): Gate+Rubrics, route_dimension audit"` - ---- - -## Task 15: assess_judge.txt — add route_type/route_confidence/ebm_query inputs, route_confidence_noted output field - -**Files:** `src/config/prompts/judge/assess_judge.txt` - -- [ ] 输入新增:{route_type}, {route_confidence}, {ebm_query_description} -- [ ] ask_to_acquire_link 审计新增:检索词是否覆盖 {route_type} 对应的关键维度 -- [ ] 新增审计项 route_confidence_noted(若 route_confidence=low,输出是否包含不确定性说明) -- [ ] 输出 JSON 新增 `route_confidence_noted: "YES | NO | NA"` -- [ ] `git commit -m "feat(judge/assess): route_type/ebm_query inputs, route_confidence_noted"` - ---- - -## Task 16: judge_llm.py — add _check_gates, _score_rubrics, RUBRIC_WEIGHTS; update _score_ask/acquire/appraise/apply - -**Files:** `src/judge/judge_llm.py`, `tests/test_judge_rubrics.py` - -```python -RUBRIC_WEIGHTS = { - "ask": {"pico_completeness": 0.30, "keyword_quality": 0.25, "route_correctness": 0.25, "clarity": 0.20}, - "acquire": {"evidence_quality": 0.35, "pico_match": 0.35, "selection_quality": 0.30}, - "appraise": {"downgrade_factors": 0.35, "computed_grade": 0.35, "upgrade_factors": 
0.30}, - "apply": {"grounding": 0.35, "strength_match": 0.35, "route_dimension": 0.15, "actionability": 0.15}, -} -``` - -测试用例(5个):Gate 触发时 _score_ask 返回 0.0;Gate 通过时按权重正确计算;_check_gates("ask", {"intent_distorted": "YES"}) 返回 False;_check_gates("apply", {"recommendation_based_on_evidence": "NO"}) 返回 False;全 YES rubric_scores 返回 1.0 - -- [ ] 先写测试,确认 FAIL -- [ ] 新增 _check_gates, _score_rubrics;更新 _score_ask/acquire/appraise/apply;更新 _prepare_context 传入路由字段 -- [ ] 确认测试 PASS -- [ ] `git commit -m "feat(judge-llm): Gate+Rubrics scoring, RUBRIC_WEIGHTS"` - ---- - -## Task 17: Integration tests — tests/test_integration_routing.py (mock LLM, test direct_answer/ebm_pico/ebm_pird routing) - -**Files:** `tests/test_integration_routing.py`(新建) - -- [ ] direct_answer 路由 → should_terminate=True,direct_answer_output 非空 -- [ ] ebm_pico 路由 → ebm_query.query_type == "pico",pico_query 兼容字段存在 -- [ ] ebm_pird 路由 → ebm_query.query_type == "pird" -- [ ] 旧 pico_query 兼容 → Acquire 正常运行 -- [ ] `git commit -m "test(integration): routing flow with mock LLM"` - ---- - -## Task 18: Full regression — run all tests, fix failures - -- [ ] `python3 -m pytest tests/ -v --tb=short 2>&1 | tail -30` -- [ ] 确认无 FAILED -- [ ] 如有失败,逐一修复后重新运行 -- [ ] `git commit -m "chore: all tests passing after 4/20-4/22 redesign"` diff --git a/docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md b/docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md deleted file mode 100644 index 914249b..0000000 --- a/docs/superpowers/plans/2026-04-22-judge-rubrics-implementation.md +++ /dev/null @@ -1,1276 +0,0 @@ -# Judge Rubrics 重设计实现计划 - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
- -**Goal:** 将 Ask/Acquire/Appraise/Apply 四个阶段的 Judge 改造为 Gate + Weighted Rubrics 架构,使评分逻辑对 LLM 和人工标注者均透明可验证。 - -**Architecture:** LLM Judge 输出每条 rubric 的 YES/PARTIAL/NO;Python 侧先做 Gate 检查(任一 NO → 立即 fail),再按 Critical=3/Major=2/Minor=1 权重计算总分,≥0.7 → pass。决策模型读取 gate 失败项和低分 rubric 群生成定向 retry 指令。 - -**Tech Stack:** Python 3.10+, LangChain LLM, pytest - ---- - -## 文件改动清单 - -| 文件 | 操作 | 说明 | -|---|---|---| -| `src/config/prompts/judge/ask_judge.txt` | 重写 | Gate + rubric 结构,动态路由段注入 | -| `src/config/prompts/judge/acquire_judge.txt` | 重写 | Gate + rubric 结构,keywords 评分迁入 | -| `src/config/prompts/judge/appraise_judge.txt` | 重写 | Gate + rubric 结构,新增升级因素审计 | -| `src/config/prompts/judge/apply_judge.txt` | 重写 | Gate + rubric 结构,route_dimension 审计 | -| `src/judge/judge_llm.py` | 修改 | `_score_*` 函数全部重写;新增 `_check_gates`;`STAGE_WEIGHTS` 替换为 rubric 权重表;Appraise 新增 Layer 1 Python 校验 | -| `tests/test_judge_rubrics.py` | 新建 | 各阶段 rubric 评分单元测试 | - ---- - -## Task 1: 重写 `ask_judge.txt` - -**Files:** -- Modify: `src/config/prompts/judge/ask_judge.txt` - -- [ ] **Step 1: 写入新 prompt** - -完整替换 `ask_judge.txt` 内容为以下内容(注意 JSON 输出示例中的双花括号是 Python format 转义,实际文件写单花括号): - -``` -# Role -你是一个严格的EBM审计员。对 Ask Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 - -# Input -原始问题:{original_question} -路由类型:{route_type} -Ask Agent 输出:{stage_output} - -# 一票否决项(Gate) -以下任一项为 NO 时,整体判定为 gate_fail,无需继续评分。 - -## G1. intent_not_distorted -结构化结果是否忠实反映原问题意图(方向性:人群、问题类型)? -- YES:意图一致 -- NO:方向性错误(问儿童→写成人;问治疗→写诊断) - -## G2. route_correct(仅当 route_type != direct_answer 时判断,否则填 NA) -route_type 与问题类型是否匹配? -- YES:匹配 -- NO:明显错误(如诊断准确性问题路由为 ebm_pico) -- NA:route_type = direct_answer,不适用 - -## G3. nonresearch_classification_correct(仅当 route_type = direct_answer 时判断,否则填 NA) -以下三条触发条件是否全部满足? -1. 问题要求立即操作性指导(动词:如何处理/立即给/紧急处置) -2. 延迟回答会直接危及患者生命安全 -3. 答案来自已有公认标准流程(BLS/ACLS/指南操作章节) -- YES:三条均满足 -- NO:任一条不满足(应重路由到 EBM 流程) -- NA:route_type != direct_answer,不适用 - -# Rubric 评分项(仅适用于 EBM 路由;direct_answer 路由时所有 rubric 填 NA) - -## R1. 
core_dimensions_present【Critical,权重3】 -P + 主焦点维度(ebm_pico→I;ebm_pird→IndexTest;ebm_peo→Exposure;ebm_prognosis→PF)+ O 是否均有实质内容? -- YES:三个核心维度均有实质内容 -- PARTIAL:三者中有一个描述极度模糊(如 O="outcomes")但方向正确 -- NO:任一核心维度完全缺失或填写错误 - -## R2. secondary_dimensions_present【Major,权重2】 -次要维度(ebm_pico→C;ebm_pird→R;ebm_prognosis→TH;ebm_peo 无次要维度填 NA)是否按路由要求填写?原问题未涉及的填 NA。 -- YES:次要维度填写正确,或原问题未涉及时正确填 NA -- PARTIAL:次要维度有轻微偏差但不影响检索方向 -- NO:次要维度明显错误(如 PIRD 的 R 字段填了干预措施) -- NA:ebm_peo 路由(无次要维度) - -## R3. statement_unambiguous【Minor,权重1】 -结构化表述是否无歧义,可直接用于检索? -- YES:表述明确,无歧义 -- PARTIAL:有轻微歧义但不影响检索方向 -- NO:严重歧义,检索方向不确定 - -# Output Format -仅输出以下 JSON,不要包含任何其他文本: - -{ - "gate_results": { - "intent_not_distorted": "YES | NO", - "route_correct": "YES | NO | NA", - "nonresearch_classification_correct": "YES | NO | NA" - }, - "rubric_results": { - "core_dimensions_present": "YES | PARTIAL | NO | NA", - "secondary_dimensions_present": "YES | PARTIAL | NO | NA", - "statement_unambiguous": "YES | PARTIAL | NO | NA" - }, - "failures": ["具体失败项及原因(无失败则为空列表)"], - "overall_quality": "pass | fail | gate_fail" -} -``` - -- [ ] **Step 2: 验证格式** - -```bash -python3 -c " -from pathlib import Path -txt = Path('src/config/prompts/judge/ask_judge.txt').read_text() -assert '{original_question}' in txt -assert '{route_type}' in txt -assert '{stage_output}' in txt -assert 'gate_results' in txt -assert 'rubric_results' in txt -print('ask_judge.txt OK') -" -``` - ---- - -## Task 2: 重写 `acquire_judge.txt` - -**Files:** -- Modify: `src/config/prompts/judge/acquire_judge.txt` - -- [ ] **Step 1: 写入新 prompt** - -完整替换 `acquire_judge.txt` 内容为: - -``` -# Role -你是一个严格的EBM审计员。对 Acquire Agent 的输出进行客观分类判断,只输出结构化 JSON,不要打分。 - -# 核心EBM原则 -证据质量 ≠ 证据数量。1篇Cochrane系统评价 > 10篇RCT > 100篇病例报告。 - -# Input -路由类型:{route_type} -结构化查询:{ebm_query} -Acquire Agent 输出(已排序的证据列表):{stage_output} - -# 预处理:系统错误检测 -首先检查输入数据中是否包含 error 字段(如 "error": "Connection timeout"): -如果存在 error 字段,说明 PubMed API 调用本身失败,与检索词无关。 -此时跳过所有审计项,直接输出:search_terms_valid=YES,所有 
rubric 填 NA,search_exhausted=false,failures=[],overall_quality=pass。 - -# 一票否决项(Gate) - -## G1. search_terms_valid -检索词方向是否正确,能对应到查询的核心概念? -- YES:检索词方向正确 -- NO:检索词方向完全错误(如问心衰治疗却检索肾功能指标) - -# Rubric 评分项 - -各 route_type 对应的主焦点维度: -- ebm_pico:Intervention -- ebm_pird:Index Test -- ebm_peo:Exposure -- ebm_prognosis:Prognostic Factor - -## R1. keywords_cover_pico_dimensions【Critical,权重3】 -关键词是否覆盖 P + 主焦点维度,且至少含一个可在 MeSH 验证的标准词? -- YES:覆盖 P + 主焦点维度,且含 MeSH 标准词 -- PARTIAL:覆盖了 P 或主焦点之一,但另一维度无对应关键词;或有覆盖但无 MeSH 标准词 -- NO:关键词全部指向同一概念,未覆盖多个维度 - -## R2. primary_focus_match【Critical,权重3】 -基于证据列表中主焦点匹配度最好的那篇证据判断:证据中的核心干预/暴露/测试是否与查询主焦点维度匹配? -- YES:精准匹配 -- PARTIAL:同类方法但有差异(不同剂量/版本),相关性高 -- NO:完全不同的测试/干预/暴露 - -## R3. outcome_match【Critical,权重3】 -基于证据列表中结局匹配度最好的那篇证据判断:证据是否报告了临床关心的结局指标? -- YES:报告了直接结局指标 -- PARTIAL:报告了代理指标或部分相关结局 -- NO:未报告任何相关结局 - -## R4. keywords_have_synonyms【Major,权重2】 -核心概念是否有同义词/变体(如 SGLT2i + empagliflozin + dapagliflozin)? -- YES:有同义词/变体 -- PARTIAL:有部分同义词但不完整 -- NO:无任何同义词扩展,仅有单一术语 - -## R5. keywords_count_sufficient【Major,权重2】 -关键词数量是否充足? -- YES:≥ 5 个 -- PARTIAL:3-4 个 -- NO:≤ 2 个 - -## R6. study_design_matches_route【Major,权重2】 -纳入文献的研究设计是否与 route_type 的优先级匹配? -匹配表: -- ebm_pico:第一优先级=SR/Meta分析(基于RCT),第二=RCT,第三=观察性研究,排除=机制综述/专家意见/病例报告 -- ebm_pird:第一优先级=SR/Meta分析(基于诊断准确性研究),第二=诊断准确性研究(横断面),第三=回顾性诊断研究,排除=机制综述/治疗类RCT -- ebm_peo:第一优先级=SR/Meta分析(基于观察性研究),第二=前瞻性队列,第三=病例对照,排除=RCT/机制综述 -- ebm_prognosis:第一优先级=SR/Meta分析(基于队列研究),第二=前瞻性队列,第三=回顾性队列,排除=机制综述/病例报告 -- YES:有第一优先级文献 -- PARTIAL:有次优先级文献但无第一优先级,或混入少量不匹配设计 -- NO:大量纳入与 route_type 不匹配的研究设计 - -## R7. population_match【Major,权重2】 -基于证据列表中人群匹配度最好的那篇证据判断:研究人群是否与查询 Patient 匹配? -- YES:精准匹配(相同年龄段、相同疾病状态) -- PARTIAL:有轻微差异,结论可审慎外推 -- NO:严重不匹配(成人证据用于儿科;完全不同疾病) - -## R8. top_selection_appropriate【Minor,权重1】 -排名靠前的文献(排名第1-3位)是否确实是列表中最优的证据选择? -- YES:排名前列的文献研究层级高且匹配度好 -- PARTIAL:总体合理,但有个别文献位置不最优 -- NO:排名顺序明显不合理(如病例报告排在SR/RCT前面) - -## R9. selection_count_appropriate【Minor,权重1】 -选取数量是否合理? 
-- YES:数量与候选质量相符 -- PARTIAL:数量略多或略少,但整体可接受 -- NO:明显不合理(大量高质量候选却只选1-2篇,或质量极差仍凑满10篇) - -## R10. key_sentences_present【Minor,权重1】 -Top 文章的 key_sentences 字段是否有实质内容? -- YES:Top 文章的 key_sentences 非空,RAG 流程正常执行 -- PARTIAL:部分文章 key_sentences 为空(摘要极短导致 chunk 失败) -- NO:所有文章 key_sentences 均为空,RAG 流程可能失败 - -# Output Format -仅输出以下 JSON,不要包含任何其他文本: - -{ - "gate_results": { - "search_terms_valid": "YES | NO" - }, - "rubric_results": { - "keywords_cover_pico_dimensions": "YES | PARTIAL | NO", - "primary_focus_match": "YES | PARTIAL | NO", - "outcome_match": "YES | PARTIAL | NO", - "keywords_have_synonyms": "YES | PARTIAL | NO", - "keywords_count_sufficient": "YES | PARTIAL | NO", - "study_design_matches_route": "YES | PARTIAL | NO", - "population_match": "YES | PARTIAL | NO", - "top_selection_appropriate": "YES | PARTIAL | NO", - "selection_count_appropriate": "YES | PARTIAL | NO", - "key_sentences_present": "YES | PARTIAL | NO" - }, - "search_exhausted": false, - "failures": ["具体失败项及原因(无失败则为空列表)"], - "overall_quality": "pass | fail | gate_fail" -} -``` - -- [ ] **Step 2: 验证格式** - -```bash -python3 -c " -from pathlib import Path -txt = Path('src/config/prompts/judge/acquire_judge.txt').read_text() -assert '{route_type}' in txt -assert '{ebm_query}' in txt -assert '{stage_output}' in txt -assert 'gate_results' in txt -assert 'rubric_results' in txt -assert 'search_exhausted' in txt -print('acquire_judge.txt OK') -" -``` - ---- - -## Task 3: 重写 `appraise_judge.txt` - -**Files:** -- Modify: `src/config/prompts/judge/appraise_judge.txt` - -- [ ] **Step 1: 写入新 prompt** - -完整替换 `appraise_judge.txt` 内容为: - -``` -# Role -你是一个严格的EBM审计员。对 Appraise Agent 的GRADE评价进行客观分类判断,只输出结构化 JSON,不要打分。 - -# 背景说明 -Appraise Agent 输出结构化的GRADE分类标签(study_type、risk_of_bias等),最终GRADE等级由系统代码根据这些标签自动计算。你的审计重点是: -1. LLM对研究类型(study_type)的识别是否正确 -2. 各降级/升级因素的分类是否合理 -3. 系统计算出的GRADE等级(computed_grade)是否与你的独立判断一致 - -# Input -证据列表:{evidence_list} -Appraise Agent 输出(包含分类标签和计算结果):{stage_output} - -# 一票否决项(Gate) - -## G1. 
study_type_correct -所有研究的 study_type 识别是否正确? -- YES:所有研究的 study_type 识别正确 -- NO:存在明显错误(如将观察性研究标记为RCT) - -## G2. computed_grade_reasonable -系统计算出的最终GRADE等级(computed_grade)是否合理? -- YES:计算结果与基于摘要的独立判断一致 -- NO:明显不合理(通常是 study_type 或降级因素错误导致) - -注意:以下情况属于合理结果,不应判断为 NO: -- SR/MA 纳入观察性研究(included_study_type=OBSERVATIONAL)→ 初始分为 Low,即使无降级因素也可能输出 Low/Very Low -- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时,即使 large_effect=YES 也不升级 -- CROSS_SECTIONAL 无升级因素 → 最高只能到 Low - -# Rubric 评分项 - -## R1. downgrade_factors_appropriate【Critical,权重3】 -四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注是否与摘要信息相符? -- YES:各因素的严重程度标签(NOT_SERIOUS/SERIOUS/VERY_SERIOUS)与摘要信息相符 -- PARTIAL:整体合理,但个别因素评估过于宽松或严苛 -- NO:存在明显错误(如未盲法 RCT 标记为 NOT_SERIOUS 偏倚风险) - -## R2. included_study_type_correct【Critical,权重3】 -(仅当证据列表含 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时判断,否则填 NA) -SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类型相符? -- YES:字段与摘要描述的纳入研究类型相符(如摘要明确描述"纳入RCT"→ RCT) -- PARTIAL:摘要信息不足以确认(如摘要未描述纳入类型 → UNKNOWN 是合理选择) -- NO:明显错误(如摘要写"仅纳入RCT"但标注为 OBSERVATIONAL) -- NA:证据列表中没有 SR/MA/NMA 类型研究 - -## R3. upgrade_factors_appropriate【Major,权重2】 -(仅当证据列表含 COHORT/CASE_CONTROL 时判断,否则填 NA) -升级因素(large_effect/dose_response/confounding_bias_mitigates)的标注是否与摘要信息相符? -- YES:升级因素的 YES/NO 标注与摘要信息相符 -- PARTIAL:整体合理,个别因素有轻微偏差 -- NO:明显错误(如无明确剂量效应数据但标注 dose_response=YES) -- NA:证据列表中没有 COHORT/CASE_CONTROL 研究 - -## R4. upgrade_blocked_appropriate【Major,权重2】 -(仅当含 COHORT/CASE_CONTROL 且 risk_of_bias=SERIOUS/VERY_SERIOUS 时判断,否则填 NA) -存在严重偏倚风险时,升级因素是否被正确阻断(upgrade_blocked_by_bias=True)? -- YES:risk_of_bias=SERIOUS/VERY_SERIOUS 时,upgrade_blocked_by_bias 正确标注为 True,且最终等级未因升级因素提升 -- NO:存在严重偏倚但升级因素仍被计入 -- NA:无 COHORT/CASE_CONTROL 研究,或 risk_of_bias 均为 NOT_SERIOUS - -## R5. conflicts_identified【Major,权重2】 -证据间存在实质性冲突时,冲突是否被正确识别并描述? -- YES:所有主要冲突均被识别,conflict_description 描述准确;或证据间无冲突(正确标记为无冲突) -- PARTIAL:识别了主要冲突,但有遗漏或描述不够深入 -- NO:存在明显冲突但完全未识别 - -## R6. numerical_data_extracted【Minor,权重1】 -摘要中存在效应量/CI/P值时,是否均被提取? 
-- YES:data_available 的判断准确,能识别摘要中存在的数值指标 -- PARTIAL:判断基本合理,有轻微偏差 -- NO:摘要有明确效应量但标记为未提取 - -# Output Format -仅输出以下 JSON,不要包含任何其他文本: - -{ - "gate_results": { - "study_type_correct": "YES | NO", - "computed_grade_reasonable": "YES | NO" - }, - "rubric_results": { - "downgrade_factors_appropriate": "YES | PARTIAL | NO", - "included_study_type_correct": "YES | PARTIAL | NO | NA", - "upgrade_factors_appropriate": "YES | PARTIAL | NO | NA", - "upgrade_blocked_appropriate": "YES | NO | NA", - "conflicts_identified": "YES | PARTIAL | NO", - "numerical_data_extracted": "YES | PARTIAL | NO" - }, - "failures": ["具体失败项及原因(无失败则为空列表)"], - "overall_quality": "pass | fail | gate_fail" -} -``` - -- [ ] **Step 2: 验证格式** - -```bash -python3 -c " -from pathlib import Path -txt = Path('src/config/prompts/judge/appraise_judge.txt').read_text() -assert '{evidence_list}' in txt -assert '{stage_output}' in txt -assert 'gate_results' in txt -assert 'rubric_results' in txt -assert 'upgrade_blocked_appropriate' in txt -print('appraise_judge.txt OK') -" -``` - ---- - -## Task 4: 重写 `apply_judge.txt` - -**Files:** -- Modify: `src/config/prompts/judge/apply_judge.txt` - -- [ ] **Step 1: 写入新 prompt** - -完整替换 `apply_judge.txt` 内容为: - -``` -# Role -你是一个严格的EBM审计员。对 Apply Agent 生成的临床推荐进行客观分类判断,只输出结构化 JSON,不要打分。 - -# Input -路由类型:{route_type} -结构化查询:{query_description} -证据评价结果(来自Appraise阶段):{appraisal_results} -Apply Agent 输出(临床推荐):{stage_output} - -# 一票否决项(Gate) - -## G1. recommendation_grounded_in_evidence -推荐意见是否基于本次检索的证据,方向与证据一致? -- YES:推荐完全来源于提供的证据,方向一致 -- NO:推荐与证据无关或方向相反 - -## G2. route_dimension_consistent -Apply 的维度一致性检查是否使用了与 route_type 匹配的框架? 
-各 route_type 对应的正确框架: -- ebm_pico:Population / Intervention / Comparator / Outcome -- ebm_pird:Population / Index Test / Reference Standard / Target Condition -- ebm_peo:Population / Exposure / Outcome(无 Comparator) -- ebm_prognosis:Population / Prognostic Factor / Outcome / Time Horizon -- YES:维度框架与 route_type 匹配 -- NO:使用了错误框架(如 PIRD 问题用 PICO 框架,Index Test 被映射为 Intervention) - -## G3. strength_not_grossly_inflated -推荐强度是否未严重超出证据上限? -- YES:推荐强度在证据支持范围内 -- NO:Very Low 或 Low 证据给出 Strong 推荐,或有充分高质量证据却输出 No Recommendation - -# Rubric 评分项 - -## R1. effect_size_correctly_reported【Critical,权重3】 -效应量、置信区间、GRADE 等级是否被正确转述,无数据失真? -- YES:数值被正确转述,无失真 -- PARTIAL:数值基本正确,有轻微表述偏差但不影响结论方向 -- NO:效应量或 GRADE 等级被错误转述,导致结论方向改变 - -## R2. strength_matches_evidence【Critical,权重3】 -推荐强度是否与证据等级严格匹配? -注意:inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐属正确行为,不应标注为不匹配。 -EBM原则:Strong需要High/Moderate直接证据;Weak适用于Low质量或结果不一致;Conditional适用于仅有间接证据;Consensus-based适用于仅有专家共识/指南。 -- YES:推荐强度与证据等级严格匹配(含上述特殊情况) -- PARTIAL:有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 -- NO:推荐强度与证据等级明显不符(不触发 gate 的中等程度不匹配) - -## R3. population_applicability_addressed【Major,权重2】 -是否明确说明了证据人群与当前患者的匹配程度,包括可外推性或外推限制? -- YES:明确说明了人群匹配程度和外推性 -- PARTIAL:有提及人群差异但说明不充分 -- NO:完全未讨论人群适配性 - -## R4. uncertainty_source_explained【Major,权重2】 -不确定性的来源是否被明确说明(如样本量不足、间接证据、研究设计局限)? -- YES:不确定性来源被明确说明 -- PARTIAL:提及了不确定性但未说明来源 -- NO:未提及不确定性,或仅说"证据有限"而无来源说明 - -## R5. citation_traceable【Major,权重2】 -推荐依据是否有文献溯源(PMID 或标题可追溯)? -- YES:推荐依据有文献溯源 -- PARTIAL:部分推荐有溯源,部分缺失 -- NO:无任何文献溯源 - -## R6. recommendation_specific【Minor,权重1】 -推荐内容是否足够具体,临床医生可据此执行(含适应症、关键参数等)? -- YES:推荐包含关键细节,临床医生可直接执行 -- PARTIAL:推荐方向明确但缺少关键细节 -- NO:推荐过于模糊,无法指导临床决策 - -## R7. patient_preference_considered【Minor,权重1】 -患者偏好或价值观是否被纳入推荐表述(或明确说明不适用)? 
-- YES:患者偏好被纳入,或明确说明不适用 -- PARTIAL:有提及但表述笼统 -- NO:完全未提及患者偏好 - -# Output Format -仅输出以下 JSON,不要包含任何其他文本: - -{ - "gate_results": { - "recommendation_grounded_in_evidence": "YES | NO", - "route_dimension_consistent": "YES | NO", - "strength_not_grossly_inflated": "YES | NO" - }, - "rubric_results": { - "effect_size_correctly_reported": "YES | PARTIAL | NO", - "strength_matches_evidence": "YES | PARTIAL | NO", - "population_applicability_addressed": "YES | PARTIAL | NO", - "uncertainty_source_explained": "YES | PARTIAL | NO", - "citation_traceable": "YES | PARTIAL | NO", - "recommendation_specific": "YES | PARTIAL | NO", - "patient_preference_considered": "YES | PARTIAL | NO" - }, - "failures": ["具体失败项及原因(无失败则为空列表)"], - "overall_quality": "pass | fail | gate_fail" -} -``` - -- [ ] **Step 2: 验证格式** - -```bash -python3 -c " -from pathlib import Path -txt = Path('src/config/prompts/judge/apply_judge.txt').read_text() -assert '{route_type}' in txt -assert '{query_description}' in txt -assert '{appraisal_results}' in txt -assert '{stage_output}' in txt -assert 'gate_results' in txt -assert 'rubric_results' in txt -assert 'route_dimension_consistent' in txt -print('apply_judge.txt OK') -" -``` - ---- - -## Task 5: 重写 `judge_llm.py` — 核心评分架构 - -**Files:** -- Modify: `src/judge/judge_llm.py` - -- [ ] **Step 1: 替换 STAGE_WEIGHTS 为 rubric 权重表** - -在 `judge_llm.py` 顶部,将 `STAGE_WEIGHTS` 替换为: - -```python -# Rubric weight definitions per stage -# Each rubric: (weight, allows_partial) -# Gate items are not listed here — they are checked separately in _check_gates() -RUBRIC_WEIGHTS = { - "Ask": { - "core_dimensions_present": (3, True), # Critical - "secondary_dimensions_present": (2, True), # Major - "statement_unambiguous": (1, True), # Minor - }, - "Acquire": { - "keywords_cover_pico_dimensions": (3, True), - "primary_focus_match": (3, True), - "outcome_match": (3, True), - "keywords_have_synonyms": (2, True), - "keywords_count_sufficient": (2, True), - "study_design_matches_route": 
(2, True), - "population_match": (2, True), - "top_selection_appropriate": (1, True), - "selection_count_appropriate": (1, True), - "key_sentences_present": (1, True), - }, - "Appraise": { - "downgrade_factors_appropriate": (3, True), - "included_study_type_correct": (3, True), - "upgrade_factors_appropriate": (2, True), - "upgrade_blocked_appropriate": (2, False), # only YES/NO/NA - "conflicts_identified": (2, True), - "numerical_data_extracted": (1, True), - }, - "Apply": { - "effect_size_correctly_reported": (3, True), - "strength_matches_evidence": (3, True), - "population_applicability_addressed":(2, True), - "uncertainty_source_explained": (2, True), - "citation_traceable": (2, True), - "recommendation_specific": (1, True), - "patient_preference_considered": (1, True), - }, -} - -PASS_THRESHOLD = 0.7 -``` - -- [ ] **Step 2: 新增 `_check_gates` 函数** - -在 `RUBRIC_WEIGHTS` 定义后添加: - -```python -def _check_gates(stage: str, audit: dict) -> list: - """ - Check gate items for a stage. Returns list of failed gate names. - Any gate failure means overall fail regardless of rubric scores. 
- """ - gate_results = audit.get("gate_results", {}) - failed = [] - - if stage == "Ask": - if gate_results.get("intent_not_distorted") == "NO": - failed.append("intent_not_distorted") - if gate_results.get("route_correct") == "NO": - failed.append("route_correct") - if gate_results.get("nonresearch_classification_correct") == "NO": - failed.append("nonresearch_classification_correct") - - elif stage == "Acquire": - if gate_results.get("search_terms_valid") == "NO": - failed.append("search_terms_valid") - - elif stage == "Appraise": - if gate_results.get("study_type_correct") == "NO": - failed.append("study_type_correct") - if gate_results.get("computed_grade_reasonable") == "NO": - failed.append("computed_grade_reasonable") - - elif stage == "Apply": - if gate_results.get("recommendation_grounded_in_evidence") == "NO": - failed.append("recommendation_grounded_in_evidence") - if gate_results.get("route_dimension_consistent") == "NO": - failed.append("route_dimension_consistent") - if gate_results.get("strength_not_grossly_inflated") == "NO": - failed.append("strength_not_grossly_inflated") - - return failed -``` - -- [ ] **Step 3: 新增 `_score_rubrics` 函数** - -```python -def _score_rubrics(stage: str, audit: dict) -> tuple: - """ - Score rubric items using weighted rubric system. - Returns (dimension_scores, raw_issues, total_score). - NA items are excluded from denominator. - YES = full weight, PARTIAL = weight * 0.5, NO = 0. 
- """ - rubric_weights = RUBRIC_WEIGHTS.get(stage, {}) - rubric_results = audit.get("rubric_results", {}) - issues = [] - total_score = 0.0 - total_max = 0.0 - dimension_scores = {} - - for rubric_name, (weight, allows_partial) in rubric_weights.items(): - val = rubric_results.get(rubric_name, "NA") - if val == "NA": - dimension_scores[rubric_name] = None # excluded - continue - - if val == "YES": - score = float(weight) - elif val == "PARTIAL" and allows_partial: - score = weight * 0.5 - else: # NO or PARTIAL on non-partial rubric - score = 0.0 - - total_score += score - total_max += weight - dimension_scores[rubric_name] = score / weight # normalize to 0-1 for display - - if val == "NO": - severity = "critical" if weight == 3 else "major" if weight == 2 else "minor" - issues.append({ - "severity": severity, - "dimension": rubric_name, - "description": f"{rubric_name} 未通过(NO)", - }) - elif val == "PARTIAL": - severity = "major" if weight >= 2 else "minor" - issues.append({ - "severity": severity, - "dimension": rubric_name, - "description": f"{rubric_name} 部分通过(PARTIAL)", - }) - - overall = total_score / total_max if total_max > 0 else 1.0 - return dimension_scores, issues, overall -``` - -- [ ] **Step 4: 重写 `_score_ask`** - -```python -def _score_ask(audit: dict) -> tuple: - gate_failures = _check_gates("Ask", audit) - if gate_failures: - issues = [{"severity": "critical", "dimension": g, - "description": f"Gate 失败: {g}"} for g in gate_failures] - return {"core_dimensions_present": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" - - # direct_answer: gate passed means classification correct → terminate signal - gate_results = audit.get("gate_results", {}) - if gate_results.get("nonresearch_classification_correct") == "YES": - return {"nonresearch": 1.0}, [], False, "direct_answer路由正确,触发terminate" - - dim_scores, issues, overall = _score_rubrics("Ask", audit) - pass_threshold = overall >= PASS_THRESHOLD - failures = audit.get("failures", []) - return 
dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" -``` - -- [ ] **Step 5: 重写 `_score_acquire`** - -```python -def _score_acquire(audit: dict) -> tuple: - search_exhausted = bool(audit.get("search_exhausted", False)) - if search_exhausted: - return {"search_exhausted": 1.0}, [], True, "检索穷尽,标记evidence_gap" - - gate_failures = _check_gates("Acquire", audit) - if gate_failures: - issues = [{"severity": "critical", "dimension": g, - "description": f"Gate 失败: {g}"} for g in gate_failures] - return {"search_terms_valid": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" - - dim_scores, issues, overall = _score_rubrics("Acquire", audit) - failures = audit.get("failures", []) - return dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" -``` - -- [ ] **Step 6: 重写 `_score_appraise`** - -```python -def _score_appraise(audit: dict) -> tuple: - gate_failures = _check_gates("Appraise", audit) - if gate_failures: - issues = [{"severity": "critical", "dimension": g, - "description": f"Gate 失败: {g}"} for g in gate_failures] - return {"study_type_correct": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" - - dim_scores, issues, overall = _score_rubrics("Appraise", audit) - failures = audit.get("failures", []) - return dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" -``` - -- [ ] **Step 7: 重写 `_score_apply`** - -```python -def _score_apply(audit: dict) -> tuple: - gate_failures = _check_gates("Apply", audit) - if gate_failures: - issues = [{"severity": "critical", "dimension": g, - "description": f"Gate 失败: {g}"} for g in gate_failures] - return {"recommendation_grounded_in_evidence": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" - - dim_scores, issues, overall = _score_rubrics("Apply", audit) - failures = audit.get("failures", []) - return dim_scores, issues, False, "; ".join(failures) if failures else f"综合评分: {overall:.2f}" -``` - -- [ ] 
**Step 8: 更新 `_calculate_overall_score` 以兼容新 rubric 体系** - -新的 `_score_*` 函数本身并不返回 overall score(返回值仍是 `(dim_scores, issues, search_exhausted, reasoning_hint)`),`_calculate_overall_score` 只在 Assess 阶段(未改动)使用。在 `evaluate_stage` 中,对 Ask/Acquire/Appraise/Apply 阶段,overall_score 从 `_score_rubrics` 直接取得(Gate 失败时直接记为 0.0),不再走 `STAGE_WEIGHTS` 加权。 - -在 `evaluate_stage` 中修改评分计算段: - -```python -dimension_scores, raw_issues, search_exhausted, reasoning_hint = scorer(audit) - -# For rubric-based stages, compute overall from rubric scores directly -if stage in ("Ask", "Acquire", "Appraise", "Apply"): - gate_failures = _check_gates(stage, audit) - if gate_failures: - overall_score = 0.0 - else: - _, _, overall_score = _score_rubrics(stage, audit) - # Clamp NA-only edge case - overall_score = max(0.0, min(1.0, overall_score)) -else: - overall_score = self._calculate_overall_score(stage, dimension_scores) -``` - -- [ ] **Step 9: 更新 `_prepare_context` 中 Acquire 和 Apply 的字段注入** - -Acquire 阶段:将 `pico_query` 替换为 `ebm_query` + `route_type`: - -```python -elif stage == "Acquire": - ebm_query = state.get("ebm_query") - pico = state.get("pico_query") - if ebm_query: - context["route_type"] = state.get("route_type", "ebm_pico") - context["ebm_query"] = json.dumps({ - "patient": ebm_query.patient, - "primary_focus": ebm_query.primary_focus, - "comparator": getattr(ebm_query, "comparator", None), - "outcome": ebm_query.outcome, - "keywords": ebm_query.keywords, - }, ensure_ascii=False, indent=2) - elif pico: - context["route_type"] = "ebm_pico" - context["ebm_query"] = json.dumps({ - "patient": pico.patient, - "primary_focus": pico.intervention, - "comparator": pico.comparison, - "outcome": pico.outcome, - "keywords": pico.keywords, - }, ensure_ascii=False, indent=2) - else: - context["route_type"] = "ebm_pico" - context["ebm_query"] = "N/A" -``` - -Apply 阶段:将 `pico_query` 替换为 `route_type` + `query_description`: - -```python -elif stage == "Apply": - context["route_type"] = state.get("route_type", "ebm_pico") - ebm_query = state.get("ebm_query") - pico = 
state.get("pico_query") - if ebm_query: - context["query_description"] = json.dumps({ - "patient": ebm_query.patient, - "primary_focus": ebm_query.primary_focus, - "outcome": ebm_query.outcome, - }, ensure_ascii=False, indent=2) - elif pico: - context["query_description"] = json.dumps({ - "patient": pico.patient, - "intervention": pico.intervention, - "comparison": pico.comparison, - "outcome": pico.outcome, - }, ensure_ascii=False, indent=2) - else: - context["query_description"] = "N/A" - # appraisal_results 注入保持不变 - appraisal = state.get("appraisal_results") - if appraisal: - context["appraisal_results"] = json.dumps({ - "evidence_count": len(appraisal.evidence), - "has_conflict": appraisal.has_conflict, - "summary": appraisal.summary, - }, ensure_ascii=False, indent=2) - else: - context["appraisal_results"] = "N/A" -``` - -Ask 阶段:新增 `route_type` 注入: - -```python -if stage == "Ask": - context["original_question"] = state["original_question"] - context["route_type"] = state.get("route_type", "unknown") -``` - -- [ ] **Step 10: 运行 lint 检查** - -```bash -python3 -m ruff check src/judge/judge_llm.py -``` - -Expected: no errors (or only pre-existing warnings unrelated to this change). - ---- - -## Task 6: 新增 Appraise Layer 1 Python 校验 - -**Files:** -- Modify: `src/judge/judge_llm.py` - -- [ ] **Step 1: 新增 `_appraise_layer1_check` 函数** - -在 `judge_llm.py` 中添加: - -```python -def _appraise_layer1_check(output: dict) -> dict: - """ - Layer 1 Python hardcoded validation for Appraise stage. - Returns dict with keys: passed (bool), failures (list[str]). - If passed=True, skip LLM Judge entirely. - Raises SystemError if grade_output_in_legal_range fails. 
- """ - LEGAL_GRADES = {"High", "Moderate", "Low", "Very Low"} - failures = [] - - appraisal = output.get("appraisal_results") - if appraisal is None: - failures.append("appraisal_results missing") - return {"passed": False, "failures": failures} - - from dataclasses import asdict, is_dataclass - appraisal_d = asdict(appraisal) if is_dataclass(appraisal) else appraisal - evidence_list = appraisal_d.get("evidence", []) - - LEGAL_STUDY_TYPES = { - "RCT", "COHORT", "CASE_CONTROL", "CASE_REPORT", - "SYSTEMATIC_REVIEW", "META_ANALYSIS", "NMA", - "GUIDELINE", "CROSS_SECTIONAL", "NARRATIVE_REVIEW", "EXPERT_OPINION", - } - - for ev in evidence_list: - study_type = ev.get("study_type") - if not study_type or study_type not in LEGAL_STUDY_TYPES: - failures.append(f"study_type missing or illegal: pmid={ev.get('pmid','?')} study_type={study_type}") - - rob = ev.get("risk_of_bias") - if rob is None: - failures.append(f"risk_of_bias missing: pmid={ev.get('pmid','?')}") - - grade = ev.get("grade_level") - if grade and grade not in LEGAL_GRADES: - raise SystemError( - f"grade_output_in_legal_range FAILED: pmid={ev.get('pmid','?')} grade={grade}. " - "Illegal grade value — workflow terminated." 
- ) - - return {"passed": len(failures) == 0, "failures": failures} -``` - -- [ ] **Step 2: 在 `evaluate_stage` 中为 Appraise 阶段插入 Layer 1 前置检查** - -在 `evaluate_stage` 方法中,在 `prompt_template = self._load_prompt(stage)` 之前插入: - -```python -# Appraise Layer 1: Python hardcoded check before calling LLM Judge -if stage == "Appraise": - layer1 = _appraise_layer1_check(output) - if layer1["passed"]: - # All structural checks pass — skip LLM Judge, return pass directly - from src.state.schema import Issue as IssueSchema - evaluation = Evaluation( - overall_score=1.0, - dimension_scores={"layer1_structural": 1.0}, - pass_threshold=True, - issues=[], - summary="Layer 1 结构校验通过,跳过 LLM Judge", - search_exhausted=False, - ) - return ObserveSchema(stage=stage, output=output, evaluation=evaluation) - else: - print(f"[Appraise Layer1] 校验失败,触发 LLM Judge: {layer1['failures']}") -``` - -- [ ] **Step 3: 运行 lint 检查** - -```bash -python3 -m ruff check src/judge/judge_llm.py -``` - ---- - -## Task 7: 编写单元测试 - -**Files:** -- Create: `tests/test_judge_rubrics.py` - -- [ ] **Step 1: 创建测试文件** - -```python -"""Unit tests for Gate + Weighted Rubrics judge scoring.""" -import pytest -from src.judge.judge_llm import ( - _check_gates, - _score_rubrics, - _score_ask, - _score_acquire, - _score_appraise, - _score_apply, - _appraise_layer1_check, - RUBRIC_WEIGHTS, - PASS_THRESHOLD, -) - - -# ── _check_gates ───────────────────────────────────────────────────────────── - -def test_check_gates_ask_all_pass(): - audit = {"gate_results": { - "intent_not_distorted": "YES", - "route_correct": "YES", - "nonresearch_classification_correct": "NA", - }} - assert _check_gates("Ask", audit) == [] - - -def test_check_gates_ask_intent_fail(): - audit = {"gate_results": {"intent_not_distorted": "NO", "route_correct": "YES"}} - assert "intent_not_distorted" in _check_gates("Ask", audit) - - -def test_check_gates_ask_route_fail(): - audit = {"gate_results": {"intent_not_distorted": "YES", "route_correct": "NO"}} - 
assert "route_correct" in _check_gates("Ask", audit) - - -def test_check_gates_acquire_pass(): - audit = {"gate_results": {"search_terms_valid": "YES"}} - assert _check_gates("Acquire", audit) == [] - - -def test_check_gates_acquire_fail(): - audit = {"gate_results": {"search_terms_valid": "NO"}} - assert "search_terms_valid" in _check_gates("Acquire", audit) - - -def test_check_gates_appraise_study_type_fail(): - audit = {"gate_results": {"study_type_correct": "NO", "computed_grade_reasonable": "YES"}} - assert "study_type_correct" in _check_gates("Appraise", audit) - - -def test_check_gates_apply_all_fail(): - audit = {"gate_results": { - "recommendation_grounded_in_evidence": "NO", - "route_dimension_consistent": "NO", - "strength_not_grossly_inflated": "YES", - }} - failures = _check_gates("Apply", audit) - assert "recommendation_grounded_in_evidence" in failures - assert "route_dimension_consistent" in failures - assert "strength_not_grossly_inflated" not in failures - - -# ── _score_rubrics ──────────────────────────────────────────────────────────── - -def test_score_rubrics_ask_all_yes(): - audit = { - "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, - "rubric_results": { - "core_dimensions_present": "YES", - "secondary_dimensions_present": "YES", - "statement_unambiguous": "YES", - } - } - dim_scores, issues, overall = _score_rubrics("Ask", audit) - assert overall == pytest.approx(1.0) - assert issues == [] - - -def test_score_rubrics_ask_partial_critical(): - # core_dimensions_present=PARTIAL → 1.5/3, others YES - audit = { - "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, - "rubric_results": { - "core_dimensions_present": "PARTIAL", - "secondary_dimensions_present": "YES", - "statement_unambiguous": "YES", - } - } - dim_scores, issues, overall = _score_rubrics("Ask", audit) - # total_score = 1.5 + 2 + 1 = 4.5, total_max = 3+2+1 = 6 - assert overall == pytest.approx(4.5 / 6) - assert any(i["severity"] == 
"major" for i in issues) - - -def test_score_rubrics_ask_no_critical(): - audit = { - "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, - "rubric_results": { - "core_dimensions_present": "NO", - "secondary_dimensions_present": "YES", - "statement_unambiguous": "YES", - } - } - dim_scores, issues, overall = _score_rubrics("Ask", audit) - # total_score = 0 + 2 + 1 = 3, total_max = 6 - assert overall == pytest.approx(3.0 / 6) - assert any(i["severity"] == "critical" for i in issues) - - -def test_score_rubrics_na_excluded_from_denominator(): - # secondary_dimensions_present=NA → excluded - audit = { - "gate_results": {"intent_not_distorted": "YES", "route_correct": "YES"}, - "rubric_results": { - "core_dimensions_present": "YES", - "secondary_dimensions_present": "NA", - "statement_unambiguous": "YES", - } - } - dim_scores, issues, overall = _score_rubrics("Ask", audit) - # total_score = 3 + 1 = 4, total_max = 3+1 = 4 - assert overall == pytest.approx(1.0) - - -def test_score_rubrics_pass_threshold(): - # Acquire: all YES → overall=1.0 → pass - rubric_results = {k: "YES" for k in RUBRIC_WEIGHTS["Acquire"]} - audit = { - "gate_results": {"search_terms_valid": "YES"}, - "rubric_results": rubric_results, - } - _, _, overall = _score_rubrics("Acquire", audit) - assert overall >= PASS_THRESHOLD - - -# ── _score_ask gate path ────────────────────────────────────────────────────── - -def test_score_ask_gate_fail_returns_zero(): - audit = {"gate_results": {"intent_not_distorted": "NO", "route_correct": "YES"}} - dim_scores, issues, search_exhausted, hint = _score_ask(audit) - assert any(i["severity"] == "critical" for i in issues) - assert "intent_not_distorted" in hint - - -def test_score_ask_direct_answer_correct(): - audit = {"gate_results": { - "intent_not_distorted": "YES", - "route_correct": "NA", - "nonresearch_classification_correct": "YES", - }, "rubric_results": {}} - dim_scores, issues, search_exhausted, hint = _score_ask(audit) - assert 
"terminate" in hint or "direct_answer" in hint - - -# ── _score_acquire search_exhausted ────────────────────────────────────────── - -def test_score_acquire_search_exhausted(): - audit = {"search_exhausted": True, "gate_results": {}, "rubric_results": {}} - dim_scores, issues, search_exhausted, hint = _score_acquire(audit) - assert search_exhausted is True - - -# ── _appraise_layer1_check ──────────────────────────────────────────────────── - -def test_appraise_layer1_pass(): - from dataclasses import dataclass - from typing import Optional - - @dataclass - class FakeEvidence: - pmid: str - study_type: str - risk_of_bias: str - grade_level: Optional[str] = "Moderate" - - @dataclass - class FakeAppraisal: - evidence: list - has_conflict: bool = False - conflict_description: Optional[str] = None - summary: str = "" - - output = {"appraisal_results": FakeAppraisal(evidence=[ - FakeEvidence(pmid="123", study_type="RCT", risk_of_bias="NOT_SERIOUS"), - ])} - result = _appraise_layer1_check(output) - assert result["passed"] is True - - -def test_appraise_layer1_missing_study_type(): - from dataclasses import dataclass - from typing import Optional - - @dataclass - class FakeEvidence: - pmid: str - study_type: Optional[str] - risk_of_bias: str - grade_level: Optional[str] = None - - @dataclass - class FakeAppraisal: - evidence: list - has_conflict: bool = False - conflict_description: Optional[str] = None - summary: str = "" - - output = {"appraisal_results": FakeAppraisal(evidence=[ - FakeEvidence(pmid="456", study_type=None, risk_of_bias="NOT_SERIOUS"), - ])} - result = _appraise_layer1_check(output) - assert result["passed"] is False - assert any("study_type" in f for f in result["failures"]) - - -def test_appraise_layer1_illegal_grade_raises(): - from dataclasses import dataclass - from typing import Optional - - @dataclass - class FakeEvidence: - pmid: str - study_type: str - risk_of_bias: str - grade_level: Optional[str] - - @dataclass - class FakeAppraisal: - 
evidence: list - has_conflict: bool = False - conflict_description: Optional[str] = None - summary: str = "" - - output = {"appraisal_results": FakeAppraisal(evidence=[ - FakeEvidence(pmid="789", study_type="RCT", risk_of_bias="NOT_SERIOUS", grade_level="ILLEGAL"), - ])} - with pytest.raises(SystemError, match="grade_output_in_legal_range"): - _appraise_layer1_check(output) -``` - -- [ ] **Step 2: 运行测试** - -```bash -python3 -m pytest tests/test_judge_rubrics.py -v --tb=short -``` - -Expected: all tests pass. - -- [ ] **Step 3: 如有失败,修复后重跑** - -```bash -python3 -m pytest tests/test_judge_rubrics.py -v --tb=short -``` - ---- - -## Task 8: 端到端冒烟验证 - -**Files:** -- No file changes — validation only - -- [ ] **Step 1: 验证所有 prompt 文件格式占位符** - -```bash -python3 -c " -from pathlib import Path -stages = { - 'ask': ['{original_question}', '{route_type}', '{stage_output}'], - 'acquire': ['{route_type}', '{ebm_query}', '{stage_output}'], - 'appraise': ['{evidence_list}', '{stage_output}'], - 'apply': ['{route_type}', '{query_description}', '{appraisal_results}', '{stage_output}'], -} -for stage, placeholders in stages.items(): - txt = Path(f'src/config/prompts/judge/{stage}_judge.txt').read_text() - for p in placeholders: - assert p in txt, f'Missing {p} in {stage}_judge.txt' - assert 'gate_results' in txt - assert 'rubric_results' in txt - print(f'{stage}_judge.txt: OK') -print('All prompt files validated.') -" -``` - -- [ ] **Step 2: 验证 judge_llm.py 可导入** - -```bash -python3 -c " -from src.judge.judge_llm import ( - _check_gates, _score_rubrics, _score_ask, _score_acquire, - _score_appraise, _score_apply, _appraise_layer1_check, - RUBRIC_WEIGHTS, PASS_THRESHOLD -) -print('judge_llm.py imports OK') -print('Stages with rubrics:', list(RUBRIC_WEIGHTS.keys())) -" -``` - -- [ ] **Step 3: 运行完整测试套件** - -```bash -python3 -m pytest tests/ --tb=short -q || [ $? -eq 5 ] -``` - -Expected: all tests pass (exit 0 or 5 if no other tests collected). 
- -- [ ] **Step 4: 运行 lint** - -```bash -python3 -m ruff check src/judge/judge_llm.py src/config/prompts/ -``` - -Expected: no new errors. - ---- - -## 补充说明:`STAGE_SCORERS` 更新 - -Task 5 Step 7 完成后,需确认 `judge_llm.py` 中的 `STAGE_SCORERS` dispatch table 与新的 `_score_*` 函数签名是否仍然对齐(结论:无需修改,原因如下)。 - -现有 `STAGE_SCORERS`(`judge_llm.py:826`): - -```python -STAGE_SCORERS = { - "Ask": _score_ask, - "Acquire": _score_acquire, - "Appraise": _score_appraise, - "Apply": _score_apply, - "Assess": _score_assess, -} -``` - -新的 `_score_ask/_score_acquire/_score_appraise/_score_apply` 签名与原来相同(均接受 `audit: dict`,返回 `(dim_scores, issues, search_exhausted, reasoning_hint)`),因此 `STAGE_SCORERS` 本身**无需修改**,dispatch 逻辑不变。 - -唯一需要注意的是 Task 5 Step 8 中 `evaluate_stage` 里 overall_score 的计算方式:对 Ask/Acquire/Appraise/Apply 阶段,在调用 `scorer(audit)` 之后,额外调用 `_score_rubrics(stage, audit)` 取得 overall_score,而不再走 `_calculate_overall_score(stage, dimension_scores)`。 - ---- diff --git a/docs/superpowers/specs/2026-03-16-opensource-quality-design.md b/docs/superpowers/specs/2026-03-16-opensource-quality-design.md deleted file mode 100644 index 20adbd9..0000000 --- a/docs/superpowers/specs/2026-03-16-opensource-quality-design.md +++ /dev/null @@ -1,141 +0,0 @@ -# Open-Source Quality Improvements — Design Spec - -**Date:** 2026-03-16 -**Scope:** Additive-only changes (zero modifications to existing `src/` code) -**Constraint:** Every change in this spec must be a new file, a non-breaking edit to a config file, or a git-tracked file move that is explicitly documented together with any README/link impact. - ---- - -## Problem Statement - -The project has a complete, working implementation but lacks the scaffolding expected of a public open-source project. 
The gaps fall into three priority tiers: - -- **P0 — Legal & security blockers**: No LICENSE file; `.env.example` contains real credentials and is gitignored (unavailable to contributors) -- **P1 — Contributor experience**: No CI/CD, no issue/PR templates, no CONTRIBUTING.md -- **P2 — Packaging & documentation hygiene**: No `pyproject.toml`; internal dev artifacts clutter root and `docs/` - ---- - -## P0 Changes - -### 1. `LICENSE` -- **Action:** Create `/LICENSE` with standard MIT text -- **Content:** Year 2026, placeholder `` -- **Why:** README declares MIT but there is no LICENSE file; legally all rights are reserved without it - -### 2. `.env.example` — sanitise then unblock -- **Action:** Two-step operation, ORDER IS CRITICAL: - 1. **Step 1 — Overwrite content first:** Replace the existing `.env.example` with placeholder-only content (see below). This must happen before any `.gitignore` change. - 2. **Step 2 — Remove from `.gitignore`:** Only after the file contains no real credentials, remove the `.env.example` line from `.gitignore` so the sanitised template enters version control. -- **Placeholder content:** - ```dotenv - LLM_BASE_URL=https://api.openai.com/v1 - LLM_API_KEY=your_api_key_here - LLM_MODEL=gpt-4 - PUBMED_EMAIL=your_email@example.com - # FAST_LLM_MODEL=gpt-3.5-turbo - ``` -- **Why two steps in this order:** If `.gitignore` is edited first, git immediately sees the file with real credentials as untracked and an accidental `git add .` would stage them. Always sanitise content first. - -### 3. `.gitignore` — tighten (combined with step 2 above, single edit) -- **Action:** In a single `.gitignore` edit (combined with step 2): - - Remove: `.env.example` line - - Remove: `QUICKSTART.md` line (see item 3b below) - - Add: `*.log` — the existing `logs/` entry covers the `logs/` directory but not `.log` files written to the project root (e.g. by CI runners); `*.log` fills that gap. `logs/` is intentionally left as-is. 
- - Skip `nul` — it is already present on line 18; do not add a duplicate -- **Note:** `COMPLETION_SUMMARY.md`, `IMPLEMENTATION_STATUS.md`, `description.md` are handled in P2 via `git mv`; no `.gitignore` change needed for them - -### 3b. `QUICKSTART.md` — unblock -- **Action:** Remove `QUICKSTART.md` from `.gitignore` so the file becomes tracked -- **Why:** README's Documentation table in both English and Chinese sections links to `QUICKSTART.md`, but the file is currently gitignored, making that link silently broken for anyone who clones the repo. The file exists on disk and contains useful quick-start content; it should simply be tracked. - ---- - -## P1 Changes - -### 4. GitHub Actions CI — `.github/workflows/ci.yml` -- **Trigger:** `push` and `pull_request` on `main` -- **Dependency strategy:** `torch` and `transformers` are heavy (~2–3 GB) and have no unit tests against them in this repo. The CI job installs from a separate `requirements-dev.txt` (created as part of this change) that contains only lightweight test/lint dependencies, NOT torch/transformers. This avoids runner timeouts and cache bloat. -- **`requirements-dev.txt` content:** Pinned versions copied verbatim from `requirements.txt`, minus `torch` and `transformers`: `langchain==0.1.0`, `langchain-openai==0.0.5`, `langgraph==0.0.20`, `requests==2.31.0`, `pytest==7.4.3`, `pytest-cov==4.1.0`, `pytest-mock==3.12.0`, `python-dotenv==1.0.0`. Using the same pinned versions is intentional — this guarantees CI uses exactly the same library behaviour as a local install. -- **Jobs:** `test` (pytest with `--tb=short`, using `requirements-dev.txt`) -- **Python version matrix:** 3.10 -- **Why:** Even with zero tests today, the CI scaffold is in place; contributors see a badge and the framework runs on PRs - -### 5. 
Issue Templates — `.github/ISSUE_TEMPLATE/` -- `bug_report.md` — fields: description, steps to reproduce, expected vs actual, environment (Python version, LLM provider, OS) -- `feature_request.md` — fields: problem statement, proposed solution, alternatives considered - -### 6. PR Template — `.github/PULL_REQUEST_TEMPLATE.md` -- Sections: Summary, Type of change (bug fix / feature / docs), Testing done, Checklist - -### 7. `CONTRIBUTING.md` -- Sections: Prerequisites, Local setup (`pip install -r requirements.txt` for full; `requirements-dev.txt` for test-only), Running tests (`pytest`), Code style, Commit conventions, PR process -- Links back to README for project overview - ---- - -## P2 Changes - -### 8. `pyproject.toml` -- **Action:** Create `/pyproject.toml` alongside existing `requirements.txt` (does not replace it) -- **Content:** `[project]` metadata (name `ebm5a`, version `0.1.0`, description, Python ≥3.10, dependencies mirroring requirements.txt), `[build-system]` using `setuptools` -- **Why:** Enables `pip install -e .` for local dev; makes the project importable without sys.path hacks - -### 9. Reorganise `docs/` -- **Scope:** Internal working-note files are moved to `docs/internal/`. `docs/superpowers/` (this spec and future specs) is explicitly **out of scope** — it is not an internal dev artifact but a design record that should remain in `docs/`. 
-- **Files to move via `git mv`:** - - `docs/acquire_agent_fix.md` → `docs/internal/acquire_agent_fix.md` - - `docs/mvp_implementation_complete.md` → `docs/internal/mvp_implementation_complete.md` - - `docs/analysis/` → `docs/internal/analysis/` - - `docs/plans/` → `docs/internal/plans/` -- **Root files to move via `git mv`:** - - `COMPLETION_SUMMARY.md` → `docs/internal/COMPLETION_SUMMARY.md` - - `IMPLEMENTATION_STATUS.md` → `docs/internal/IMPLEMENTATION_STATUS.md` - - `description.md` → `docs/internal/description.md` -- **Root files intentionally left in place:** `CHANGELOG.md` — this is a standard open-source convention file; it belongs in the project root alongside README and LICENSE and must NOT be moved. -- **README impact:** README references `docs/` in the Documentation table as "Architecture design". After the move, `docs/` will contain `docs/internal/`, `docs/superpowers/`, and the new `docs/architecture.md` stub. The README link still resolves to a valid directory. The README Documentation table must be updated to add a row for `docs/internal/` ("Internal development notes") to reflect the new structure. This README edit is included in the file change summary below. - -### 10. 
`docs/architecture.md` (stub) -- **Action:** Create `docs/architecture.md` as a minimal public-facing architecture overview -- **Content:** One-paragraph summary + links to the relevant README sections (How It Works, Project Structure) -- **Why:** README's Documentation table promises an architecture doc at `docs/`; this fulfils that promise - ---- - -## Out of Scope - -- Any changes to `src/` (agents, coordinator, tools, config, state) -- Any changes to `requirements.txt` pinned versions (dependency upgrade is a separate decision) -- Adding actual test cases (CI scaffold only; tests are left for a future iteration) -- Moving `docs/superpowers/` (this is a design record, not an internal dev artifact) - ---- - -## File Change Summary - -| File | Action | -|------|--------| -| `LICENSE` | **Create** | -| `.env.example` | **Edit** (overwrite with placeholders — must precede .gitignore edit) | -| `.gitignore` | **Edit** (remove `.env.example` and `QUICKSTART.md` lines; add `*.log`) | -| `requirements-dev.txt` | **Create** (lightweight deps for CI) | -| `.github/workflows/ci.yml` | **Create** | -| `.github/ISSUE_TEMPLATE/bug_report.md` | **Create** | -| `.github/ISSUE_TEMPLATE/feature_request.md` | **Create** | -| `.github/PULL_REQUEST_TEMPLATE.md` | **Create** | -| `CONTRIBUTING.md` | **Create** | -| `pyproject.toml` | **Create** | -| `docs/architecture.md` | **Create** (stub) | -| `docs/internal/` | **Create dir** | -| `docs/acquire_agent_fix.md` | **Move** → `docs/internal/` | -| `docs/mvp_implementation_complete.md` | **Move** → `docs/internal/` | -| `docs/analysis/` | **Move** → `docs/internal/analysis/` | -| `docs/plans/` | **Move** → `docs/internal/plans/` | -| `COMPLETION_SUMMARY.md` | **Move** → `docs/internal/` | -| `IMPLEMENTATION_STATUS.md` | **Move** → `docs/internal/` | -| `description.md` | **Move** → `docs/internal/` | -| `QUICKSTART.md` | **Unblock** (remove from `.gitignore`) | -| `CHANGELOG.md` | **Leave in place** (standard root-level open-source 
file; must NOT be moved) | -| `README.md` | **Edit** (update Documentation table to include `docs/internal/` row) | - -**Total: 10 new files, 4 edited files, 7 items moved, 1 unblocked. Zero `src/` changes.** diff --git a/docs/superpowers/specs/2026-03-20-obstetrics-evidence-db-design.md b/docs/superpowers/specs/2026-03-20-obstetrics-evidence-db-design.md deleted file mode 100644 index 024453f..0000000 --- a/docs/superpowers/specs/2026-03-20-obstetrics-evidence-db-design.md +++ /dev/null @@ -1,169 +0,0 @@ -# 产科本地证据库设计文档 - -**日期**: 2026-03-20 -**状态**: 待实现 - ---- - -## 背景与目标 - -当前系统通过 PubMed E-utilities API 实时检索文献,受限于: -- 只能获取 title + abstract,无全文 -- 网络延迟高,每次查询需 3 次 API 调用 -- 产科专科问题的检索精度依赖通用 Boolean 查询 - -目标:构建一个本地产科专科证据库,**替代** Acquire agent 中的 PubMed 检索流程,支持全文混合检索。 - ---- - -## 范围 - -- Demo 规模:~10 篇产科相关全文文献 -- 数据来源:PMC Open Access(合法、免费、有全文 XML) -- 检索方式:BM25 关键词 + 向量语义,RRF 融合 -- 集成方式:替代 `search_pubmed()`,对外接口保持兼容 - -**不在范围内**: -- 付费文献爬取 -- 与 PubMed 并行双路检索 -- 非产科问题的本地库支持 - ---- - -## 架构 - -### 组件 - -``` -scripts/build_obstetrics_db.py # 一次性建库脚本(爬取 + 解析 + 索引) -src/tools/local_evidence_db.py # 检索接口(供 AcquireAgent 调用) -data/obstetrics_db/ # 原始全文 XML + 解析后 JSON -data/obstetrics_chroma/ # ChromaDB 向量索引 -``` - -### 数据流 - -``` -build_obstetrics_db.py - └─ 1. 从 PMC 爬取产科文献全文 XML(PMCID 列表硬编码) - └─ 2. 解析 XML → 提取 title/abstract/full_text/pmid/pmcid/authors/date - └─ 3. 分块(chunk_size=512 tokens,overlap=64) - └─ 4. 生成 embedding(all-MiniLM-L6-v2)→ 存入 ChromaDB - └─ 5. 
建 BM25 索引(rank_bm25 库)→ 序列化到 data/obstetrics_db/bm25.pkl - -local_evidence_db.py - └─ search(query, top_k=20) - ├─ BM25 检索 → top-N 候选 + BM25 分数 - ├─ 向量检索(ChromaDB)→ top-N 候选 + 余弦相似度 - └─ RRF 融合 → 返回 List[Evidence](与现有 schema 兼容) -``` - ---- - -## 数据模型 - -复用现有 `Evidence` dataclass,新增两个可选字段: - -```python -@dataclass -class Evidence: - # 现有字段(不变) - title: str - source: str - pmid: Optional[str] - abstract: str - relevance_score: float - study_type: Optional[str] - publication_date: Optional[str] - grade_level: Optional[str] - # 新增 - pmcid: Optional[str] = None # PMC 文章 ID - full_text: Optional[str] = None # 全文(仅本地库有) -``` - ---- - -## 检索算法 - -### BM25 -- 库:`rank_bm25` -- 索引粒度:文章级(title + abstract + full_text 拼接) -- 分词:简单空格分词(英文足够) - -### 向量检索 -- 模型:`sentence-transformers/all-MiniLM-L6-v2`(384 维,本地运行) -- 索引:ChromaDB persistent client -- 索引粒度:512-token 分块,检索后聚合回文章级 - -### RRF 融合 -``` -rrf_score(d) = Σ 1 / (k + rank_i(d)),k=60 -``` -两路各取 top-20,RRF 合并后返回 top_k 篇。 - ---- - -## AcquireAgent 集成 - -修改 `acquire_agent.py`: - -```python -# 新增导入 -from src.tools.local_evidence_db import search_local - -# execute() 中替换检索调用 -if self._use_local_db(question_type): - raw_results = search_local(query=base_query, top_k=20) -else: - raw_results = search_pubmed(query=filtered_query, max_results=20) -``` - -`_use_local_db()` 判断逻辑:当前 demo 阶段始终返回 `True`(后续可按 question_type 或配置开关控制)。 - ---- - -## 建库脚本设计 - -`scripts/build_obstetrics_db.py` 接受一个 PMCID 列表(硬编码 10 个产科相关文章),执行: - -1. `GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={pmcid}&retmode=xml` -2. 解析 JATS XML,提取结构化字段 -3. 全文分块 + embedding -4. 
写入 ChromaDB 和 BM25 索引 - -幂等:重复运行不重复写入(按 pmcid 去重)。 - ---- - -## 依赖 - -新增 Python 包: -``` -rank-bm25 -chromadb -sentence-transformers -``` - ---- - -## Demo 文献列表(初始 10 篇) - -选取产科高影响力开放获取文献,覆盖: -- 妊娠期高血压/子痫前期 -- 妊娠期糖尿病 -- 产后出血 -- 早产 -- 剖宫产 vs 阴道分娩 - -具体 PMCID 在实现时确认(需验证 PMC OA 可访问性)。 - ---- - -## 风险与限制 - -| 风险 | 缓解 | -|------|------| -| PMC XML 结构不统一 | 解析器做容错,缺字段时降级到 abstract | -| embedding 模型首次下载慢 | 脚本提示用户,模型缓存到 `~/.cache/` | -| 10 篇样本召回率低 | 明确标注为 demo,后续扩库不改接口 | -| full_text 字段增大 Appraise prompt | Appraise agent 继续只用 abstract;full_text 仅用于检索阶段 | diff --git a/docs/superpowers/specs/2026-03-25-web-ui-improvements-design.md b/docs/superpowers/specs/2026-03-25-web-ui-improvements-design.md deleted file mode 100644 index 7619130..0000000 --- a/docs/superpowers/specs/2026-03-25-web-ui-improvements-design.md +++ /dev/null @@ -1,239 +0,0 @@ - -# Web UI Improvements Design - -**Date:** 2026-03-25 -**Project:** TrueTruth (formerly EBM 5A) -**Scope:** Five targeted improvements to the clinical decision support web interface, identified after initial user testing. - ---- - -## 1. History Panel (localStorage Persistence) - -### Problem -When a new question is submitted, the previous run's results are lost. There is no way to review a prior question without re-running it. - -### Design - -**Storage:** Completed workflow runs are serialized to `localStorage` under the key `truetruth_history`. Each entry stores: -```json -{ - "id": "", - "question": "...", - "timestamp": "ISO-8601", - "status": "completed | terminated | error", - "stages": { ... }, - "backtracks": [...], - "finalResult": { ... } -} -``` - -**Save trigger:** `saveToHistory()` is called in the Zustand store on two events (yielding one of three statuses): -- `WORKFLOW_COMPLETED` (status = `"completed"` if recommendation exists, `"terminated"` if null) -- `WORKFLOW_ERROR` (status = `"error"`) - -**Storage budget:** Raw `stages` data can be large (evidence lists, per-call logs). 
Before saving, strip the per-call `logs` arrays (available in the LogConsole during live runs but not needed in history replay) and truncate long text fields. Target: ≤ 100 KB per entry. History is capped at 20 entries; oldest is dropped when the limit is exceeded. If `JSON.stringify` of an entry exceeds 200 KB after pruning, the entry is saved without the `stages` field (summary-only mode, showing only the final result). - -**UI:** A collapsible left sidebar (~220px wide) lists past runs in reverse-chronological order. Each row shows: -- Question text (truncated to ~60 chars) -- Timestamp (relative: "2 hours ago") -- Status icon: ✓ Complete / ⚠ Terminated / ✗ Error - -Clicking a history entry loads the stored state into the Zustand store in read-only mode (workflow cannot be re-run from this view, but all stage details, judge scores, and the final recommendation are visible). A "← New Question" button returns to the live view. The sidebar can be toggled with a ☰ button; collapse state is preserved in `localStorage` as `truetruth_sidebar_open`. - -**Implementation files:** -- `web/frontend/src/store/workflowStore.js`: add `saveToHistory()` action (called on `WORKFLOW_COMPLETED` and `WORKFLOW_ERROR`), add `loadFromHistory(entry)` action, add `historyView: boolean` flag to distinguish read-only mode -- `web/frontend/src/App.jsx`: add `<HistorySidebar />` to layout, adjust grid to `[sidebar] [main]` -- `web/frontend/src/components/HistorySidebar.jsx`: new component - ---- - -## 2. Contextual Tooltips and Stage Descriptions - -### Problem -Technical terms (FAST-PATH, CAVEATS, PICO, GRADE, Judge dimensions, Scheduling Decision, Backtrack) are opaque to users unfamiliar with the EBM 5A framework. Stage cards give no overview of what each step does. - -### Design - -**Tooltip component:** A small `<InfoTooltip>` component renders a `ⓘ` icon that shows a popover on hover. 
To avoid clipping inside scrolling containers, the popover uses `position: fixed` (with its coordinates computed via a `getBoundingClientRect()` call when the hover event fires) rather than `position: absolute`. This keeps the popover from being clipped by scrolling ancestor containers. The component uses a single `useState` for the computed position, with `onMouseEnter`/`onMouseLeave` event handlers. - -**Stage header descriptions:** Each stage card gets a one-line subtitle explaining its role: -- **Ask** — "将临床问题结构化为 PICO 格式,提取检索关键词" -- **Acquire** — "从证据库中检索相关文献段落,筛选最相关条目" -- **Appraise** — "评估证据质量和等级(GRADE),识别研究间冲突" -- **Apply** — "基于证据生成临床推荐意见及推荐强度" -- **Assess** — "自评推荐质量,识别证据缺口,决定是否需要回溯" - -**Term glossary (tooltip text):** - -| Term | Tooltip | -|------|---------| -| FAST-PATH | 当评分通过且无重大问题时,调度器自动跳过 LLM 决策直接进入下一阶段 | -| PICO | 临床问题四要素:P=患者/病症, I=干预措施, C=对照, O=结局指标 | -| GRADE | 证据质量分级体系:High→Moderate→Low→Very Low | -| Caveats | 使用本推荐意见时需注意的例外、限制或特殊情况 | -| Backtrack | 当前阶段质量不足时,系统回到更早阶段重新执行 | -| Scheduling Decision | LLM 根据 Judge 评分和问题特征决定下一步动作(继续/重试/回溯/终止)| -| Judge Score | 独立评估模块对当前阶段输出质量的打分,跨多个维度加权平均 | -| Workflow Quality | Assess agent 对整个 workflow 输出质量的自评分,独立于 Judge Score | - -Note: The "Workflow Quality" tooltip requires first adding a visible label "Workflow Quality (自评)" to the quality ring in `AssessOutput` (currently the ring has no label). This label is also required by Problem 4c. 
- -**Implementation files:** -- `web/frontend/src/components/InfoTooltip.jsx`: new component (position: fixed popover) -- `web/frontend/src/components/StageCard.jsx`: add stage subtitle + tooltips; add "Workflow Quality (自评)" label to `AssessOutput` quality ring -- `web/frontend/src/components/JudgeScorePanel.jsx`: tooltip on "Judge Evaluation" header and dimension names -- `web/frontend/src/components/DecisionBadge.jsx`: tooltip on "Scheduling Decision" -- `web/frontend/src/components/RecommendationPanel.jsx`: tooltip on "Caveats" -- `web/frontend/src/index.css`: tooltip/popover styles - ---- - -## 3. Span-Level Evidence Retrieval - -### Problem -The local evidence DB currently retrieves articles as evidence candidates and passes them to the Acquire agent. With a small database (10 articles), the agent is forced to select from the same 10 articles every time, leading to artificially high selection counts. Clinical evidence should cite specific passages, not whole articles. - -### Design - -**Input surface clarification:** The current `search_local()` in `local_evidence_db.py` aggregates ChromaDB chunk hits back to article level. The span extraction operates on the article **abstract** (250–400 words), not on raw chunks, since that is the text currently stored on each `Evidence` object. This avoids changes to the ChromaDB retrieval path. - -**Span extraction algorithm** (new function `_extract_spans(abstract_text, query_keywords)` in `local_evidence_db.py`): - -``` -1. Split abstract into sentences (on 。.!? boundaries) -2. Score each sentence: count of query_keywords it contains (case-insensitive) -3. threshold = 1 (at least one keyword match required; tune empirically) -4. Merge adjacent sentences that both score ≥ threshold into a single span -5. If ≥ 60% of sentences score ≥ threshold, return the full abstract as one span -6. Return top-3 spans ranked by (max sentence score in span), each capped at 200 chars -7. 
If no sentence scores ≥ threshold, return None (no span extracted) -``` - -**Evidence schema change** (`src/state/schema.py`): Add optional field: -```python -key_sentences: Optional[str] = None -``` -The `abstract` field is kept (retains the full abstract for context display). `full_text` remains excluded from all prompts. - -**`search_local()` change:** After building each `Evidence` object, call `_extract_spans(ev.abstract, query_keywords)` and assign the result to `ev.key_sentences`. The candidate pool remains article-level (10 articles); span extraction is a display/prompt enrichment step, not a re-ranking step. If span extraction yields no result for an article, the article is still returned (with `key_sentences=None`). - -**Acquire agent changes** (`src/agents/acquire_agent.py`): -1. In `_listwise_rank`, the candidate block currently uses `e.abstract[:150]`. Change to use `e.key_sentences if e.key_sentences else e.abstract[:150]`, so the LLM sees the extracted span instead of a truncated abstract when available. -2. Fix pre-existing latent NameError: in the `except` handler of the search step (~line 262), `filtered_query` is referenced but only assigned in the PubMed branch. Since `_use_local_db()` currently always returns True this is never triggered, but change `filtered_query` → `search_query_used` to match the variable that is actually assigned in the local DB branch. - -**Serializer change** (`web/backend/serializers.py`, function `serialize_evidence_list`): Include `key_sentences` in the serialized evidence dict (alongside `abstract`). This field is served in the `AGENT_COMPLETED` SSE event for the Acquire stage. - -**Judge change** (`src/judge/judge_llm.py`): Add `ev.pop("key_sentences", None)` alongside the existing `ev.pop("full_text", None)` in the **Appraise stage** serialization only (the loop over `appraisal_d["evidence"]`). 
The Acquire stage's condensed evidence block is built field-by-field and already excludes `key_sentences` by construction — no change needed there. - -**Frontend change** (`web/frontend/src/components/EvidenceTable.jsx`): When `key_sentences` is present, display it as the primary evidence text in a highlighted block (e.g., light blue background, left border accent). The existing `abstract_preview` field (200-char truncation, already sent by the serializer) serves as the collapsible "Context Preview" below. No full abstract is added to avoid payload bloat. When `key_sentences` is `null` or absent, `abstract_preview` is displayed directly as before (no highlighted block). - -**Internal dependency order within Problem 3:** -1. `schema.py` — add `key_sentences` field -2. `local_evidence_db.py` — add `_extract_spans`, update `search_local` -3. `acquire_agent.py` — update `_listwise_rank` candidate block -4. `judge_llm.py` — add `key_sentences` exclusion -5. `serializers.py` — include `key_sentences` in output -6. `EvidenceTable.jsx` — render `key_sentences` - -**Implementation files:** -- `src/state/schema.py` -- `src/tools/local_evidence_db.py` -- `src/agents/acquire_agent.py` (`_listwise_rank` candidate block only) -- `src/judge/judge_llm.py` -- `web/backend/serializers.py` -- `web/frontend/src/components/EvidenceTable.jsx` - ---- - -## 4. Display Confusion Fixes - -### 4a. Call Tab: Judge Pass vs. Scheduling Decision - -**Problem:** A Call tab showing ✓ (Judge passed) alongside a subsequent retry creates confusion — users expect ✓ to mean "this call succeeded overall." 
- -**Fix:** When `stage.calls.length > 1` (tabs are rendered), each tab displays two independent indicators: -- Left: Judge result — `✓` (green, `pass_threshold=true`) or `✗` (orange, false), or `·` (gray, judge not yet available) -- Right: Scheduling action icon — `→` proceed, `↺` retry, `↩` backtrack, `⚡` fastpath, or nothing if decision not yet received - -Example: Call 1 tab shows `✓ ↺` (judge passed but scheduler chose to retry). Call 2 shows `✓ →` (passed and proceeded). - -For single-call stages (no tabs rendered), no additional indicator is needed — the Judge Evaluation section below already shows the full score. - -**Implementation files:** -- `web/frontend/src/components/StageCard.jsx`: update call tab render logic - -### 4b. JudgeScorePanel: Score vs. Issues Explanation - -**Problem:** A high overall score (e.g., 0.85) alongside a list of issues seems contradictory. - -**Fix:** Add a one-line note below the score circle: "总分为各维度加权平均;Minor 问题不大幅影响分数,但仍列出供参考。" Sort issues by severity (Critical → Major → Minor) and show a severity count summary line (e.g., "1 Critical · 2 Minor") before the list. - -**Implementation files:** -- `web/frontend/src/components/JudgeScorePanel.jsx` - -### 4c. Assess Stage: Two Distinct Scores - -**Problem:** Assess stage shows two numerical scores with no clear distinction — the Assess agent's `quality_score` (self-assessment of the workflow) and the Judge's `overall_score` (evaluation of the Assess agent's work). - -**Fix:** Label them explicitly: -- `AssessOutput` in `StageCard.jsx`: quality ring labeled **"Workflow Quality (自评)"** -- `JudgeScorePanel` below the divider: already titled "Judge Evaluation" — ensure the header reads **"Judge Score (第三方评分)"** -- In `RecommendationPanel.jsx` (final banner): the "Quality Assessment" section also renders a quality ring — add the same "Workflow Quality (自评)" label there for consistency. 
- -**Implementation files:** -- `web/frontend/src/components/StageCard.jsx` (`AssessOutput` section) -- `web/frontend/src/components/JudgeScorePanel.jsx` (header label) -- `web/frontend/src/components/RecommendationPanel.jsx` (Quality Assessment label) - ---- - -## 5. Branding: TrueTruth - -### Problem -The title "EBM 5A" is internal project nomenclature, not a product name. The header is small relative to modern AI application standards. - -### Design - -**Name change:** All occurrences of "EBM 5A" → "TrueTruth" in the UI, including: -- `App.jsx` header text -- `web/frontend/index.html` `<title>` tag -- `web/backend/app.py` `FastAPI(title=...)` parameter - -**Header redesign:** -``` -┌─────────────────────────────────────────────────────────────────┐ -│ TrueTruth [Complete] [Error] │ -│ AI-Powered Clinical Evidence Synthesis │ -└─────────────────────────────────────────────────────────────────┘ -``` -- "TrueTruth": 36px, bold, white -- Subtitle: 13px, muted gray, italic -- Status badges right-aligned -- Header height increases from ~48px to ~72px - -**Implementation files:** -- `web/frontend/src/App.jsx` -- `web/frontend/index.html` -- `web/backend/app.py` -- `web/frontend/src/index.css` (`.header` height and title font-size) - ---- - -## Implementation Order - -Problems 1–5 are independent at the problem level. Suggested sequence: - -1. **Problem 5** (Title/branding) — trivial -2. **Problem 4** (Display fixes) — frontend-only, low risk -3. **Problem 2** (Tooltips) — frontend-only, additive; depends on Problem 4c adding the "Workflow Quality" label first -4. **Problem 1** (History panel) — moderate complexity, localStorage only -5. 
**Problem 3** (Span retrieval) — backend + frontend, highest complexity; must follow the internal dependency order listed in that section - ---- - -## Out of Scope - -- Redesigning the overall layout or color scheme -- Multi-user / server-side session persistence -- Sentence-level re-indexing of the ChromaDB vector store (span extraction operates on abstracts at query time) -- Exporting results to PDF/Word (noted for future consideration) diff --git a/docs/superpowers/specs/2026-04-03-repo-usability-improvements-design.md b/docs/superpowers/specs/2026-04-03-repo-usability-improvements-design.md deleted file mode 100644 index e558b64..0000000 --- a/docs/superpowers/specs/2026-04-03-repo-usability-improvements-design.md +++ /dev/null @@ -1,338 +0,0 @@ -# Repo Usability Improvements Design - -**Date:** 2026-04-03 -**Status:** Approved -**Scope:** Developer experience, deployment, CI, documentation — no changes to business logic or agent code - ---- - -## 1. Motivation - -EBM 5A is a well-architected, research-grade system. The primary friction for new users and contributors is not code quality but discoverability and setup guidance. Specific gaps identified: - -- No Docker deployment or one-command startup -- Web UI entirely undocumented in README -- No CI pipeline (no build status, no automated test/lint gates) -- No unified command entry point (Makefile) -- No setup validation before first run -- No troubleshooting guide or glossary -- No issue/PR templates - -This spec covers all improvements in a single implementation pass (Approach A: comprehensive at once). - ---- - -## 2. 
File Change Summary - -### New files - -| File | Purpose | -|------|---------| -| `Dockerfile.backend` | Python/FastAPI container (non-root user) | -| `Dockerfile.frontend` | Multi-stage Node build + Nginx static server (non-root user) | -| `nginx.conf` | Nginx config: `/api` reverse proxy + SPA `try_files` routing | -| `docker-compose.yml` | Production one-command full-stack startup | -| `docker-compose.dev.yml` | Development override (source mounts, hot reload) | -| `.dockerignore` | Exclude logs, cache, .env, __pycache__ from build context | -| `Makefile` | Unified command entry point | -| `scripts/check_env.py` | .env validation script | -| `.github/workflows/ci.yml` | GitHub Actions CI (lint + test + docker-build, with caching) | -| `.github/ISSUE_TEMPLATE/bug_report.md` | Bug report template | -| `.github/ISSUE_TEMPLATE/feature_request.md` | Feature request template | -| `.github/pull_request_template.md` | PR template | -| `docs/troubleshooting.md` | Common errors and fixes | -| `docs/glossary.md` | GRADE, PICO, recommendation strengths, Judge score | - -### Modified files - -| File | Change | -|------|--------| -| `README.md` | Add badges, Web UI section, Docker quick start, screenshot placeholder | -| `QUICKSTART.md` | Add Docker startup instructions | -| `.env.example` | Audit and sync all required env vars with current codebase | -| `.gitignore` | Ensure `data/cache/`, `logs/`, `.env` are excluded | - -### Unchanged - -- All `src/` agent/coordinator/judge/scheduling code -- All `web/` frontend and backend code -- `docs/internal/` (historical docs preserved as-is) - ---- - -## 3. Docker Deployment - -### Architecture - -Two services defined in `docker-compose.yml`: - -``` -backend — Python 3.11-slim, runs FastAPI via uvicorn on port 8000 -frontend — Multi-stage: node:20-alpine builds Vite bundle, nginx:alpine serves static files on port 80 -``` - -No third service in this pass (Redis/queue deferred to future). 
- -### Key decisions - -- **`.env` injected via `env_file`, never COPY'd into image** — prevents accidental secret leakage in image layers -- **`data/cache/` and `logs/` mounted as named volumes** — PubMed cache and run logs persist across container restarts -- **Nginx reverse-proxies `/api` → `backend:8000`** — eliminates CORS issues; frontend uses a single origin -- **Frontend `VITE_API_URL` set to `/api` at build time** — aligns with Nginx proxy path -- **`docker-compose.dev.yml` override** — mounts `src/` and `web/` as volumes for hot reload during development -- **Non-root users in both containers** — backend runs as `appuser` (UID 1000), frontend Nginx runs as `nginx` user; limits blast radius of any RCE vulnerability -- **`nginx.conf` includes SPA routing** — `try_files $uri $uri/ /index.html` ensures React Router routes (e.g. `/history`) don't 404 on browser refresh - -### User experience - -```bash -cp .env.example .env # fill in API key and PubMed email -make docker-up # docker compose up --build -d -# open http://localhost -``` - -### Dockerfile details - -**`Dockerfile.backend`** -- Base: `python:3.11-slim` -- Install `requirements.txt` then `requirements-web.txt` -- Working directory: `/app` -- Create non-root user: `RUN useradd -m -u 1000 appuser && chown -R appuser /app` -- Switch to non-root: `USER appuser` -- Entrypoint: `uvicorn web.backend.app:app --host 0.0.0.0 --port 8000` - -**`Dockerfile.frontend`** -- Stage 1 (build): `node:20-alpine`, runs `npm ci && npm run build` -- Stage 2 (serve): `nginx:alpine`, copies build output to `/usr/share/nginx/html` -- Copies `nginx.conf` into `/etc/nginx/conf.d/default.conf` -- Nginx runs as the built-in `nginx` user (non-root by default in `nginx:alpine`) - -**`nginx.conf`** -```nginx -server { - listen 80; - - # SPA routing: React Router client-side routes must not 404 on refresh - location / { - root /usr/share/nginx/html; - try_files $uri $uri/ /index.html; - } - - # Reverse proxy to FastAPI backend - 
location /api/ { - proxy_pass http://backend:8000/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - } -} -``` - ---- - -## 4. Makefile - -`make` with no arguments prints help (default target). All commands have inline comments that are auto-parsed into the help output. - -### Commands - -```makefile -# Development -make dev-backend # uvicorn with --reload -make dev-frontend # npm run dev -make dev # both in background (via & or screen) - -# Docker -make docker-up # docker compose up --build -d -make docker-down # docker compose down -make docker-logs # docker compose logs -f - -# Quality -make test # pytest -make lint # ruff check src/ web/backend/ -make format # ruff format src/ web/backend/ - -# Utilities -make check-env # python scripts/check_env.py -make cli QUERY="..." # python -m src.main "$(QUERY)" - -# Help -make help # list all commands (default) -``` - ---- - -## 5. GitHub Actions CI - -**File:** `.github/workflows/ci.yml` - -**Triggers:** push to `main`, pull_request targeting `main` - -### Jobs - -``` -lint ──────────────────────────────────────────────┐ - ├── docker-build -test ──────────────────────────────────────────────┘ -``` - -- `lint` and `test` run in parallel -- `docker-build` runs after both pass (needs: [lint, test]) - -### Job details - -**lint** -- Python 3.11 -- Cache: `actions/cache` on `~/.cache/pip` keyed by `requirements.txt` hash -- `pip install ruff` -- `ruff check src/ web/backend/` -- `ruff format --check src/ web/backend/` - -**test** -- Python 3.11 -- Cache: `actions/cache` on `~/.cache/pip` keyed by `requirements.txt` + `requirements-web.txt` hash -- `pip install -r requirements.txt -r requirements-web.txt` -- Environment: `LLM_API_KEY=ci-placeholder-not-real`, `PUBMED_EMAIL=ci@example.com` - - **Policy:** CI never uses real API keys. The placeholder value is intentionally non-functional. 
Real keys must never appear in YAML or GitHub Secrets for this job — doing so would send live (costly) requests on every PR. -- `pytest tests/ --tb=short; STATUS=$?; [ $STATUS -eq 5 ] && exit 0 || exit $STATUS` -- Handles pytest exit code 5 ("no tests collected") gracefully — CI passes if tests/ is empty or has no test files; real failures (exit code 1) still fail CI - -**docker-build** -- Cache: Docker layer cache via `cache-from: type=gha` and `cache-to: type=gha,mode=max` (GitHub Actions cache backend) -- `docker compose build` (no run, no real API keys needed) -- Validates Dockerfiles and compose config are valid - -### README badges - -```markdown -[![CI](https://github.com/USER/ebm5a/actions/workflows/ci.yml/badge.svg)](https://github.com/USER/ebm5a/actions/workflows/ci.yml) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) -[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/) -``` - -Note: `USER` placeholder must be replaced with actual GitHub username when repo is known. - ---- - -## 6. Setup Validation Script - -**File:** `scripts/check_env.py` - -Checks performed in order: - -1. `.env` file exists in project root -2. `LLM_API_KEY` is set and non-empty -3. `LLM_API_BASE` is set and reachable (HTTP HEAD, 5s timeout) -4. `PUBMED_EMAIL` is set and matches basic email format -5. Python version >= 3.10 -6. Core packages importable: `langchain`, `torch`, `fastapi`, `uvicorn` -7. `FAST_LLM_MODEL` set (optional — prints advisory if missing, not failure) - -Output format: -``` -[✓] .env file found -[✓] LLM_API_KEY is set -[✓] LLM_API_BASE reachable (200 OK) -[✓] PUBMED_EMAIL format valid -[✓] Python 3.11.x >= 3.10 -[✓] Core dependencies installed -[~] FAST_LLM_MODEL not set (optional — Judge/Scheduling will use LLM_MODEL) - -All required checks passed. Ready to run. -``` - -On failure, each `[✗]` line includes a specific fix hint, e.g.: -``` -[✗] LLM_API_KEY not set → Add LLM_API_KEY=sk-... 
to your .env file -``` - -Exit code 0 on pass, 1 on any required check failure. - ---- - -## 7. Documentation - -### `docs/troubleshooting.md` - -Sections: -- **Setup errors** — missing .env, invalid API key, wrong LLM_API_BASE URL -- **PubMed issues** — rate limiting, unregistered email, network timeout -- **Runtime behavior** — why a run takes 2–10 minutes (normal), what backtrack events mean -- **Web UI issues** — CORS errors (use Docker or set VITE_API_URL), frontend blank page -- **Log interpretation** — what `[TIMING]`, `[FAST-PATH]`, `Judge score` lines mean - -### `docs/glossary.md` - -Terms defined: -- **5A Framework** — Ask, Acquire, Appraise, Apply, Assess -- **GRADE** — evidence quality levels: High, Moderate, Low, Very Low -- **PICO** — Patient, Intervention, Comparison, Outcome -- **Recommendation strength** — Strong, Conditional, Consensus-based, Insufficient Evidence -- **Judge score** — 0–1 quality gate, threshold 0.7 to proceed -- **ReAct loop** — Reasoning + Acting control loop with backtrack capability -- **Question types** — Therapy, Diagnosis, Prognosis, Harm, Prevention - -### `README.md` changes - -1. Add badges block at top (CI, License, Python version) -2. Add screenshot placeholder section: `docs/assets/screenshot.png` -3. New section **"Quick Start (Docker)"** with 3-command flow -4. New section **"Web UI"** with manual startup instructions (backend + frontend) -5. Reference `make check-env` in the setup steps -6. Link to `docs/troubleshooting.md` and `docs/glossary.md` - -### `QUICKSTART.md` changes - -Add Docker startup as the first (recommended) option before the existing manual steps. - ---- - -## 8. 
Issue and PR Templates - -### `.github/ISSUE_TEMPLATE/bug_report.md` - -Fields: -- Environment (OS, Python version, LLM provider, interface: CLI or Web UI) -- Steps to reproduce -- Expected vs actual behavior -- Relevant log excerpt (from `logs/` directory) - -### `.github/ISSUE_TEMPLATE/feature_request.md` - -Fields: -- Feature description -- Use case / motivation -- Possible implementation approach (optional) - -### `.github/pull_request_template.md` - -Fields: -- Summary of changes -- Related issue (Closes #xxx) -- How to test -- Checklist: lint passes, tests pass, docs updated if needed - ---- - -## 9. Implementation Order - -Tasks can be executed largely in parallel except where noted: - -1. **Dockerfiles + docker-compose** (foundation — other docs reference these commands) -2. **Makefile** (depends on knowing Docker commands from step 1) -3. **`scripts/check_env.py`** (independent) -4. **GitHub Actions CI** (references Makefile commands) -5. **`docs/troubleshooting.md`** (independent) -6. **`docs/glossary.md`** (independent) -7. **Issue/PR templates** (independent) -8. **README.md + QUICKSTART.md updates** (depends on steps 1–2 being final — references real commands) - ---- - -## 10. 
Out of Scope - -- Changes to `src/` agent, coordinator, judge, or scheduling logic -- Changes to `web/` frontend or backend feature code -- Redis or other infrastructure services -- Deploy/publish CI steps (push to registry, deploy to server) -- Cleaning up `docs/internal/` historical files -- Adding new tests (CI runs existing tests only) diff --git a/run_ab_test.sh b/run_ab_test.sh old mode 100755 new mode 100644 diff --git a/run_test.sh b/run_test.sh old mode 100755 new mode 100644 diff --git a/scripts/check_env.py b/scripts/check_env.py old mode 100755 new mode 100644 diff --git a/src/agents/acquire_agent.py b/src/agents/acquire_agent.py index d33a65b..56ee905 100644 --- a/src/agents/acquire_agent.py +++ b/src/agents/acquire_agent.py @@ -58,7 +58,7 @@ } # Number of top-K articles to select via listwise ranking. -_TOP_K = 10 +_TOP_K = 5 # --------------------------------------------------------------------------- # Lazy-loaded sentence-transformer for RAG reranking @@ -75,7 +75,14 @@ def _get_embedding_model(): if _embedding_model is None: try: from sentence_transformers import SentenceTransformer # noqa: PLC0415 - _embedding_model = SentenceTransformer("all-MiniLM-L6-v2") + try: + # Use cached model without network check (avoids HuggingFace timeouts) + _embedding_model = SentenceTransformer( + "all-MiniLM-L6-v2", local_files_only=True + ) + except Exception: + # Model not cached yet — download it once + _embedding_model = SentenceTransformer("all-MiniLM-L6-v2") except Exception: _embedding_model = None # graceful degradation return _embedding_model @@ -95,10 +102,12 @@ class AcquireAgent(BaseAgent): 7. Full-text articles are promoted to the front of the ranked list. 
""" - def __init__(self, llm, tools: List[Any] = None): + def __init__(self, llm, ranking_llm=None, tools: List[Any] = None): super().__init__(llm=llm, tools=tools or [], agent_type="Acquire") self.prompt_template = self._load_prompt("acquire_agent.txt") self.ranking_prompt_template = self._load_prompt("acquire_ranking.txt") + # Listwise ranking is a classification/sorting task — fast model is sufficient. + self.ranking_llm = ranking_llm or llm def _load_prompt(self, filename: str) -> str: prompt_path = Path(__file__).parent.parent / "config" / "prompts" / filename @@ -132,12 +141,9 @@ def _extract_query(self, content: str) -> str: return content.strip() def _use_local_db(self, question_type: str = "Therapy") -> bool: - """Return True to route retrieval through the local obstetrics evidence DB. - - Demo phase: always True. Later this can be switched per question_type - or via an environment variable / config flag. - """ - return True + """Return True to route retrieval through the local obstetrics evidence DB.""" + import os + return os.getenv("USE_LOCAL_DB", "false").lower() == "true" def _apply_search_filter(self, query: str, question_type: str = "Therapy", route_type: str = "") -> str: """Wrap query with an appropriate filter based on route_type (preferred) or question_type.""" @@ -230,7 +236,29 @@ def _rag_extract( return " … ".join(top3), boost def _infer_study_type(self, evidence: Evidence) -> str: - """Infer study type from title and abstract using keyword rules.""" + """Infer study type from PubMed publication types (primary) then title/abstract keywords (fallback).""" + # --- Primary: PubMed pubtype metadata (authoritative, index-assigned) --- + pub_types = getattr(evidence, "pub_types", None) or [] + pt_lower = {pt.lower() for pt in pub_types} + if "meta-analysis" in pt_lower: + return "Systematic Review" + if "systematic review" in pt_lower: + return "Systematic Review" + if "randomized controlled trial" in pt_lower or "controlled clinical trial" in 
pt_lower: + return "RCT" + if "clinical trial" in pt_lower: + return "RCT" + if "observational study" in pt_lower or "cohort study" in pt_lower: + return "Cohort Study" + if "case-control study" in pt_lower or "case control study" in pt_lower: + return "Case-Control Study" + if "case reports" in pt_lower: + return "Case Report" + if "review" in pt_lower: + # "Review" pubtype without "Systematic Review" → narrative review + return "Narrative Review" + + # --- Fallback: keyword scan of title + abstract --- text = f"{evidence.title} {evidence.abstract or ''}".lower() if "systematic review" in text or "meta-analysis" in text: return "Systematic Review" @@ -239,7 +267,6 @@ def _infer_study_type(self, evidence: Evidence) -> str: or "randomised controlled trial" in text or "randomized clinical trial" in text or "randomised clinical trial" in text - or "rct" in text or " randomized " in text or " randomised " in text ): @@ -298,7 +325,7 @@ def _listwise_rank( candidates=candidate_text, ) - response = self.llm.invoke(prompt) + response = self.ranking_llm.invoke(prompt) print( f"[DEBUG] Listwise ranking response (first 300 chars): {response.content[:300]}" ) @@ -352,7 +379,7 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: "outcome": ebm_query.outcome, } query_keywords = ebm_query.keywords - route_type = ebm_query.query_type # e.g. "pico", "pird", "peo", "prognosis" + route_type = f"ebm_{ebm_query.query_type}" # e.g. "ebm_pico", "ebm_pird" else: pico_dict = { "patient": pico.patient, @@ -379,6 +406,11 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: keywords=", ".join(query_keywords), backtrack_context=backtrack_context, ) + # Split into system + user messages so the static prefix (role, worked + # example, instructions) gets prefix-cached by the gateway. See + # base.split_prompt_for_caching. 
+ from src.agents.base import split_prompt_for_caching + prompt = split_prompt_for_caching(prompt) t0 = time.time() response = self.llm.invoke(prompt) print(f"[TIMING] Acquire query LLM: {time.time()-t0:.1f}s") diff --git a/src/agents/apply_agent.py b/src/agents/apply_agent.py index 17892e4..f5bda78 100644 --- a/src/agents/apply_agent.py +++ b/src/agents/apply_agent.py @@ -6,17 +6,29 @@ def _format_ebm_query(ebm_query: EBMQuery) -> str: """Format an EBMQuery into a concise human-readable description.""" - parts = [f"类型: {ebm_query.query_type}"] - parts.append(f"患者/人群: {ebm_query.patient}") - parts.append(f"主要关注点: {ebm_query.primary_focus}") - if ebm_query.comparator: - parts.append(f"对照: {ebm_query.comparator}") - if ebm_query.reference_standard: - parts.append(f"参考标准: {ebm_query.reference_standard}") - parts.append(f"结局: {ebm_query.outcome}") - if ebm_query.time_horizon: - parts.append(f"时间范围: {ebm_query.time_horizon}") - return "; ".join(parts) + def _s(v: Any) -> str: + return str(v) if v is not None else "N/A" + + qt = ebm_query.query_type + if qt == "pico": + return (f"Patient: {_s(ebm_query.patient)} | " + f"Intervention: {_s(ebm_query.primary_focus)} | " + f"Comparator: {_s(ebm_query.comparator)} | " + f"Outcome: {_s(ebm_query.outcome)}") + elif qt == "pird": + return (f"Patient: {_s(ebm_query.patient)} | " + f"Index Test: {_s(ebm_query.primary_focus)} | " + f"Reference Standard: {_s(ebm_query.reference_standard)} | " + f"Target Condition: {_s(ebm_query.outcome)}") + elif qt == "peo": + return (f"Patient: {_s(ebm_query.patient)} | " + f"Exposure: {_s(ebm_query.primary_focus)} | " + f"Outcome: {_s(ebm_query.outcome)}") + else: # prognosis + return (f"Patient: {_s(ebm_query.patient)} | " + f"Prognostic Factor: {_s(ebm_query.primary_focus)} | " + f"Outcome: {_s(ebm_query.outcome)} | " + f"Time Horizon: {_s(ebm_query.time_horizon)}") def _format_pico_query(pico_query: PICOQuery) -> str: @@ -105,7 +117,9 @@ def execute(self, state: WorkflowState) -> Dict[str, 
Any]: evidence_summary = "\n\n".join( [ - f"Evidence {i+1}:\nTitle: {e.title}\nQuality: {e.grade_level}\nSource: {e.source}" + f"Evidence {i+1}:\nTitle: {e.title}\nGRADE: {e.grade_level}\n" + f"Study Type: {e.study_type or 'Unknown'}\n" + f"Key Findings:\n{e.key_sentences or e.abstract or '(无摘要)'}" for i, e in enumerate(appraisal.evidence) ] ) @@ -166,19 +180,15 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: retry_response = self.llm.invoke(retry_prompt) rec_dict = self._parse_json(retry_response.content) - # Determine overall evidence quality - grades = [e.grade_level for e in appraisal.evidence if e.grade_level] - if "High" in grades: - evidence_quality = "High" - elif "Moderate" in grades: - evidence_quality = "Moderate" - elif "Low" in grades: - evidence_quality = "Low" - else: + # evidence_quality is now determined by the LLM itself (see apply_agent.txt Step 4). + # The LLM reports the quality of evidence it actually adopted, not all retrieved evidence. + # Fallback to "Very Low" if the field is missing or unrecognised. + _valid_qualities = {"High", "Moderate", "Low", "Very Low"} + evidence_quality = rec_dict.get("evidence_quality", "") + if evidence_quality not in _valid_qualities: evidence_quality = "Very Low" - # GRADE enforcement: clamp strength to match evidence quality. - # LLM may override this despite prompt instructions, so we enforce in Python. + # Safety clamp: enforce hard GRADE rules the LLM may still violate. 
llm_strength = rec_dict.get("strength", "Weak") if evidence_quality in ("Very Low", "Low") and llm_strength == "Strong": strength = "Weak" diff --git a/src/agents/appraise_agent.py b/src/agents/appraise_agent.py index 21488ad..4d66008 100644 --- a/src/agents/appraise_agent.py +++ b/src/agents/appraise_agent.py @@ -122,6 +122,8 @@ def _compute_grade(appraisal: Dict) -> str: points += 1 if appraisal.get("dose_response") == "YES": points += 1 + if appraisal.get("confounding_bias_mitigates") == "YES": + points += 1 # Observational evidence cannot reach High (4) through upgrades alone points = min(points, 3) @@ -165,22 +167,30 @@ def _parse_json(self, content: str) -> dict: return robust_parse_json(content) def _format_evidence_list(self, evidence_list) -> str: - """Format evidence list for the prompt, including abstract preview.""" + """Format evidence list for the prompt, including full abstract and pub_types.""" parts = [] for i, e in enumerate(evidence_list): - abstract_preview = (getattr(e, "abstract", "") or "")[:200] + abstract = (getattr(e, "abstract", "") or "") study_type_hint = getattr(e, "study_type", "") or "" hint_str = ( f"\nSource DB study_type hint: {study_type_hint}" if study_type_hint else "" ) + # pub_types from PubMed metadata is authoritative for study design. + # Pass it explicitly so the Agent uses it instead of guessing from text. 
+ pub_types = getattr(e, "pub_types", None) or [] + pub_types_str = ( + f"\nPubMed pub_types (authoritative): {', '.join(pub_types)}" + if pub_types + else "" + ) parts.append( f"Evidence {i + 1}:\n" f"Title: {e.title}\n" f"Source: {e.source}\n" - f"PMID: {e.pmid}{hint_str}\n" - f"Abstract (preview): {abstract_preview}" + f"PMID: {e.pmid}{hint_str}{pub_types_str}\n" + f"Abstract: {abstract}" ) return "\n\n".join(parts) @@ -253,7 +263,16 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: """Execute Appraise agent to classify GRADE factors and compute final grades.""" evidence_list = state.get("evidence_list") if not evidence_list: - raise ValueError("No evidence found in state") + # Graceful terminate — Coordinator should have caught this, but guard here too + state["should_terminate"] = True + state["backtrack_reason"] = "Appraise: evidence_list is empty, cannot proceed." + return { + "appraisal_results": None, + "grade_rationales": [], + "numerical_confidence": 0.0, + "numerical_data": {"data_available": "NO", "confidence_level": "VERY_LOW", "note": "No evidence available"}, + "bias_inconsistency": False, + } backtrack_context = "" if state.get("backtrack_reason"): @@ -317,6 +336,7 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: "evidence_id": i + 1, "title": evidence.title, "study_type": study_type, + "included_study_type": appraisal.get("included_study_type", "NA"), "initial_grade": initial_grade, "risk_of_bias": appraisal.get("risk_of_bias", "NOT_SERIOUS"), "inconsistency": appraisal.get("inconsistency", "NA"), @@ -325,6 +345,11 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: "publication_bias": appraisal.get("publication_bias", "UNDETECTED"), "large_effect": appraisal.get("large_effect", "NA"), "dose_response": appraisal.get("dose_response", "NA"), + "confounding_bias_mitigates": appraisal.get("confounding_bias_mitigates", "NA"), + "upgrade_blocked_by_bias": ( + study_type in _UPGRADE_STUDY_TYPES + and 
appraisal.get("risk_of_bias") in ("SERIOUS", "VERY_SERIOUS") + ), "computed_grade": computed_grade, "rationale": appraisal.get("rationale", ""), } diff --git a/src/agents/ask_agent.py b/src/agents/ask_agent.py index d9ccf6b..c40a952 100644 --- a/src/agents/ask_agent.py +++ b/src/agents/ask_agent.py @@ -1,12 +1,18 @@ """ AskAgent — clinical question triage and EBM query structuring. -Routing flow: - 1. Router prompt → route_type: "direct_answer" | "full_pipeline" | "sub_questions" +Routing flow (V2, post 2026-05-18 A/B validation): + 1. Unified router prompt → route_type + question_type + ebm_framework + + (for non-Diagnosis full_pipeline) structured query 2a. direct_answer → DirectAnswer prompt → populate direct_answer_output, set should_terminate - 2b. sub_questions → decompose into sub-question list, recurse on first sub-question - 2c. full_pipeline → framework-specific prompt (PICO / PIRD / PEO / Prognosis) - Diagnosis questions run diag_step1 → diag_step2 before PIRD + 2b. sub_questions → decompose into sub-question list, structure each via framework prompt + 2c. full_pipeline (non-Diagnosis) → use the query already produced by unified router + 2d. full_pipeline (Diagnosis) → diag_step1 → ebm_pird (V1 two-step flow retained) + +V2 merges router + framework structuring for the non-Diagnosis path, saving one +LLM call per Ask invocation. A/B experiment (6 cases × 3 repeats, 2026-05-18) +showed V2 quality ≥ V1 across all cases with avg latency reduction ~50% on +simple framework paths. 
""" from __future__ import annotations @@ -105,7 +111,7 @@ def __init__(self, llm, tools: Optional[List[Any]] = None): self._prompts: Dict[str, str] = { stem: _load_prompt(stem) for stem in [ - "router", + "router_unified", "direct_answer", "diag_step1", "diag_step2", @@ -115,6 +121,10 @@ def __init__(self, llm, tools: Optional[List[Any]] = None): "ebm_prognosis", ] } + # Cache for the unified router payload within a single execute() call, + # so _run_router and _handle_full_pipeline can both consume it without + # making a second LLM call. + self._last_router_payload: Optional[dict] = None # ------------------------------------------------------------------ # Public entry point @@ -130,8 +140,10 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: question = state["original_question"] backtrack_context = self._build_backtrack_context(state) - # ── Step 1: Route ────────────────────────────────────────────── - route_result = self._run_router(question, backtrack_context) + # ── Step 1: Unified router (routes + structures non-Diagnosis query) ─ + self._last_router_payload = self._call_unified_router(question, backtrack_context) + route_result = self._last_router_payload + route_type = route_result.get("route_type", "full_pipeline") if route_type not in _VALID_ROUTE_TYPES: logger.warning("Unknown route_type '%s', defaulting to full_pipeline", route_type) @@ -183,12 +195,14 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: ) # ------------------------------------------------------------------ - # Router + # Router (unified — replaces V1's router.txt + framework prompt pair + # for non-Diagnosis full_pipeline questions) # ------------------------------------------------------------------ - def _run_router(self, question: str, backtrack_context: str) -> dict: - """Call the router prompt and return parsed JSON.""" - prompt = self._prompts["router"].format( + def _call_unified_router(self, question: str, backtrack_context: str) -> dict: + """Call the 
unified router prompt. Returns parsed JSON with routing + decision and (for non-Diagnosis full_pipeline) a `query` sub-object.""" + prompt = self._prompts["router_unified"].format( question=question, backtrack_context=backtrack_context, ) @@ -196,8 +210,9 @@ def _run_router(self, question: str, backtrack_context: str) -> dict: try: return robust_parse_json(response.content) except ValueError as exc: - logger.error("Router JSON parse failed: %s", exc) - return {"route_type": "full_pipeline", "question_type": "Therapy", "ebm_framework": "pico"} + logger.error("Unified router JSON parse failed: %s", exc) + return {"route_type": "full_pipeline", "question_type": "Therapy", + "ebm_framework": "pico", "query": None} # ------------------------------------------------------------------ # Route handlers @@ -304,13 +319,24 @@ def _handle_full_pipeline( route_confidence: float, backtrack_context: str, ) -> Dict[str, Any]: - """Structure the question into an EBMQuery and return state updates.""" - ebm = self._structure_question( - question=question, - question_type=question_type, - ebm_framework=ebm_framework, - backtrack_context=backtrack_context, - ) + """Structure the question into an EBMQuery and return state updates. + + For non-Diagnosis questions, the unified router already produced the + structured query, so no extra LLM call is needed. Diagnosis questions + still run diag_step1 + ebm_pird to preserve diagnostic reasoning quality. + """ + cached_query = (self._last_router_payload or {}).get("query") + if cached_query: + ebm = _ebm_query_from_dict(cached_query) + else: + # Diagnosis path (or unified router missed the query — fall back to + # the two-step structuring flow). 
+ ebm = self._structure_question( + question=question, + question_type=question_type, + ebm_framework=ebm_framework, + backtrack_context=backtrack_context, + ) return { "route_type": "full_pipeline", "route_confidence": route_confidence, diff --git a/src/agents/assess_agent.py b/src/agents/assess_agent.py index df45e02..e344218 100644 --- a/src/agents/assess_agent.py +++ b/src/agents/assess_agent.py @@ -70,4 +70,27 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: backtrack_reason=assess_dict.get("backtrack_reason"), ) - return {"assessment": assessment} + # Hard GRADE gate: Strong recommendation requires assess quality_score ≥ 0.70. + # Apply already clamps on evidence_quality, but the deterministic Assess score + # captures completeness / strength_consistency / reasoning_chain gaps that Apply + # cannot see. If the final audit disagrees with the strength label, downgrade. + result: Dict[str, Any] = {"assessment": assessment} + if recommendation.strength == "Strong" and quality_score < 0.70: + downgraded = recommendation.model_copy(update={ + "strength": "Weak", + "caveats": list(recommendation.caveats) + [ + f"Strength 已由 Strong 自动下调为 Weak:Assess quality_score={quality_score:.2f} < 0.70," + "审计发现完整性/一致性/推理链/警示不足,不满足 Strong 推荐的硬门槛。" + ], + }) if hasattr(recommendation, "model_copy") else recommendation + if not hasattr(recommendation, "model_copy"): + downgraded.strength = "Weak" + downgraded.caveats = list(recommendation.caveats) + [ + f"Strength 已由 Strong 自动下调为 Weak:Assess quality_score={quality_score:.2f} < 0.70。" + ] + print( + f"[GRADE-CLAMP] Strong → Weak: quality_score={quality_score:.2f} < 0.70." 
+ ) + result["recommendation"] = downgraded + + return result diff --git a/src/agents/base.py b/src/agents/base.py index cfcad79..bcdfed1 100644 --- a/src/agents/base.py +++ b/src/agents/base.py @@ -63,6 +63,18 @@ def robust_parse_json(content: str) -> dict: repaired = _attempt_json_repair(raw) try: return json.loads(repaired) + except json.JSONDecodeError: + pass + + # Stage 4: strip rationale/reasoning fields that may contain unescaped quotes + stripped = re.sub( + r'"(rationale|reasoning|explanation|summary)"\s*:\s*"(?:[^"\\]|\\.)*"', + r'"\1": ""', + repaired, + flags=re.DOTALL, + ) + try: + return json.loads(stripped) except json.JSONDecodeError as final_err: raise ValueError( f"JSON parse failed after repair attempt.\n" @@ -71,6 +83,23 @@ def robust_parse_json(content: str) -> dict: ) +# Marker used in prompt templates to separate the static system portion from +# the variable user portion. Splitting on this marker enables system-message +# prompt caching at the huatuogpt.cn gateway (verified 2026-05-18: cuts +# prompt_tokens by ~98% on repeated calls with the same static prefix). +SYSTEM_USER_MARKER = "%%USER_INPUT_BELOW%%" + + +def split_prompt_for_caching(formatted: str) -> dict | str: + """Split a formatted prompt on SYSTEM_USER_MARKER into a system+user dict + suitable for passing to _LLMClient.invoke(). If the marker is absent, + returns the original string unchanged.""" + if SYSTEM_USER_MARKER not in formatted: + return formatted + system, user = formatted.split(SYSTEM_USER_MARKER, 1) + return {"system": system.strip(), "user": user.strip()} + + class BaseAgent(ABC): """Base class for all agents""" diff --git a/src/config/llm_config.py b/src/config/llm_config.py index 49e516e..0a499d3 100644 --- a/src/config/llm_config.py +++ b/src/config/llm_config.py @@ -1,44 +1,256 @@ import os -from langchain_openai import ChatOpenAI +import random +import time +import openai from dotenv import load_dotenv load_dotenv() +# Per-call timeout (seconds). 
Caps single LLM call wall time so a hung gateway +# can't eat the whole 900s pipeline budget. Tunable via env. +_LLM_CALL_TIMEOUT = float(os.getenv("EBM_LLM_CALL_TIMEOUT", "90")) +_LLM_MAX_RETRIES = int(os.getenv("EBM_LLM_MAX_RETRIES", "3")) +# Long backoff for 429 rate-limit responses. Most gateways sliding-window over +# 60s+, so retrying after a few seconds just burns through the quota faster. +_LLM_RATE_LIMIT_BACKOFF = float(os.getenv("EBM_LLM_RATE_LIMIT_BACKOFF", "60")) +_LLM_RATE_LIMIT_MAX_RETRIES = int(os.getenv("EBM_LLM_RATE_LIMIT_MAX_RETRIES", "5")) -def get_llm(temperature: float = 0.0) -> ChatOpenAI: - """ - Get configured LLM instance for main agent tasks. +# Per-purpose client pool — each key gets its own openai.OpenAI instance with +# its own HTTP connection pool, preventing pipeline agents from blocking Judge/Scheduling. +_clients: dict[str, openai.OpenAI] = {} - Args: - temperature: Sampling temperature (0.0 = deterministic, 1.0 = creative) - Returns: - Configured ChatOpenAI instance - """ - return ChatOpenAI( - base_url=os.getenv("LLM_BASE_URL", "https://api.openai.com/v1"), - api_key=os.getenv("LLM_API_KEY", ""), - model=os.getenv("LLM_MODEL", "gpt-4"), - temperature=temperature, - ) +def _get_client(purpose: str) -> openai.OpenAI: + if purpose not in _clients: + _clients[purpose] = openai.OpenAI( + base_url=os.getenv("LLM_BASE_URL", "https://api.openai.com/v1"), + api_key=os.getenv("LLM_API_KEY", ""), + timeout=_LLM_CALL_TIMEOUT, + max_retries=0, + ) + return _clients[purpose] + + +# Transient errors worth retrying with exponential backoff. 5xx, connection +# drops, and read timeouts at the gateway layer are usually safe to retry; other +# 4xx (400 bad request, 401 auth) are not. 429 is retryable but uses a much +# longer dedicated backoff (see _call_with_retry). 
+_RETRYABLE_EXC = ( + openai.APIConnectionError, + openai.APITimeoutError, + openai.InternalServerError, +) + + +def _retry_after_seconds(exc) -> float | None: + """Pull a Retry-After value out of a RateLimitError, if the gateway supplied one.""" + resp = getattr(exc, "response", None) + if resp is None: + return None + try: + val = resp.headers.get("Retry-After") or resp.headers.get("retry-after") + except Exception: + return None + if val is None: + return None + try: + return float(val) + except (TypeError, ValueError): + return None + + +def _call_with_retry(fn, label: str): + last_exc = None + rate_limit_attempts = 0 + for attempt in range(_LLM_MAX_RETRIES): + try: + return fn() + except openai.RateLimitError as exc: + last_exc = exc + if rate_limit_attempts >= _LLM_RATE_LIMIT_MAX_RETRIES - 1: + break + rate_limit_attempts += 1 + # Sliding-window quotas reset over tens of seconds; short backoff just + # consumes more quota slots. Respect Retry-After when present, else + # use a long fixed backoff with light jitter. + hinted = _retry_after_seconds(exc) + backoff = hinted if hinted is not None else (_LLM_RATE_LIMIT_BACKOFF + random.uniform(0, 10)) + print(f"[LLM-RETRY] {label} rate-limited (429) attempt {rate_limit_attempts}/{_LLM_RATE_LIMIT_MAX_RETRIES}. Backing off {backoff:.1f}s") + time.sleep(backoff) + except _RETRYABLE_EXC as exc: + last_exc = exc + if attempt == _LLM_MAX_RETRIES - 1: + break + backoff = (2 ** attempt) + random.uniform(0, 0.5) + print(f"[LLM-RETRY] {label} attempt {attempt + 1}/{_LLM_MAX_RETRIES} failed: {type(exc).__name__}: {exc}. 
Backing off {backoff:.1f}s") + time.sleep(backoff) + raise last_exc + + +class _LLMResponse: + """Minimal response wrapper with .content attribute.""" + def __init__(self, content: str, usage=None, ttft=None, elapsed=None): + self.content = content + self.usage = usage + self.ttft = ttft + self.elapsed = elapsed + + +# Aggregate cache-hit telemetry across the session so we can confirm whether +# the upstream gateway supports OpenAI-style automatic prompt caching. +_cache_stats = {"calls": 0, "prompt_tokens": 0, "cached_tokens": 0} + +# Per-purpose TTFT samples for streaming-mode runs. +_ttft_samples: dict[str, list[dict]] = {} + + +def get_cache_stats() -> dict: + """Return a snapshot of per-session prompt-cache telemetry.""" + return dict(_cache_stats) -def get_fast_llm(temperature: float = 0.0) -> ChatOpenAI: - """ - Get a faster/cheaper LLM instance for classification tasks (Judge, Scheduling). +def get_ttft_samples() -> dict: + """Return all TTFT samples grouped by purpose.""" + return {k: list(v) for k, v in _ttft_samples.items()} - Falls back to the main LLM if FAST_LLM_MODEL is not set. - Configure via FAST_LLM_MODEL env var (e.g. claude-sonnet-4-6, gpt-4o-mini). - Args: - temperature: Sampling temperature +class _LLMClient: + """Thin wrapper around openai.OpenAI that mimics langchain's .invoke() interface. - Returns: - Configured ChatOpenAI instance - """ - return ChatOpenAI( - base_url=os.getenv("LLM_BASE_URL", "https://api.openai.com/v1"), - api_key=os.getenv("LLM_API_KEY", ""), + By default uses non-streaming (server returns a complete response). Streaming + mode is available via env `EBM_STREAM_TTFT=1` for TTFT measurement, but it + was observed that the huatuogpt.cn gateway can inject a literal string + "[Error: Stream interrupted: NetworkError]" into a chunk when its upstream + connection drops, which silently corrupts the response content. 
Production + runs therefore stay on non-streaming.""" + + def __init__(self, purpose: str, model: str, temperature: float): + self._purpose = purpose + self._model = model + self._temperature = temperature + + def invoke(self, prompt) -> _LLMResponse: + # Some Anthropic-compatible gateways (e.g. hk.oarel.com) reject role="system" + # in the messages array — they expect the Anthropic Messages API shape where + # `system` is a top-level parameter. Setting EBM_FOLD_SYSTEM_INTO_USER=1 + # concatenates system content into the first user message instead. Default ON + # because it works on both gateway styles (the OpenAI ones treat the extra + # prefix as ordinary user text and still hit prefix cache on the static part). + fold_system = os.getenv("EBM_FOLD_SYSTEM_INTO_USER", "1") != "0" + + if isinstance(prompt, dict) and "system" in prompt and "user" in prompt: + if fold_system: + messages = [ + {"role": "user", "content": f"{prompt['system']}\n\n{prompt['user']}"}, + ] + else: + # System+user split: maximizes prefix caching on the static portion. 
+ messages = [ + {"role": "system", "content": prompt["system"]}, + {"role": "user", "content": prompt["user"]}, + ] + elif isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + elif isinstance(prompt, list): + role_map = {"human": "user", "ai": "assistant", "system": "system"} + converted = [ + {"role": role_map.get(getattr(m, "type", "user"), "user"), "content": m.content} + for m in prompt + ] + if fold_system and converted and converted[0]["role"] == "system": + sys_content = converted[0]["content"] + rest = converted[1:] + if rest and rest[0]["role"] == "user": + rest[0] = {"role": "user", "content": f"{sys_content}\n\n{rest[0]['content']}"} + messages = rest + else: + messages = [{"role": "user", "content": sys_content}] + rest + else: + messages = converted + else: + messages = [{"role": "user", "content": str(prompt)}] + + if os.getenv("EBM_STREAM_TTFT") == "1": + return self._invoke_streaming(messages) + return self._invoke_blocking(messages) + + def _invoke_blocking(self, messages) -> _LLMResponse: + t0 = time.time() + resp = _call_with_retry( + lambda: _get_client(self._purpose).chat.completions.create( + model=self._model, + messages=messages, + temperature=self._temperature, + ), + label=f"{self._purpose}/blocking", + ) + elapsed = time.time() - t0 + usage = getattr(resp, "usage", None) + self._record_telemetry(usage, ttft=None, elapsed=elapsed) + return _LLMResponse( + resp.choices[0].message.content, usage=usage, ttft=None, elapsed=elapsed + ) + + def _invoke_streaming(self, messages) -> _LLMResponse: + t0 = time.time() + ttft = None + chunks: list[str] = [] + usage = None + stream = _get_client(self._purpose).chat.completions.create( + model=self._model, + messages=messages, + temperature=self._temperature, + stream=True, + stream_options={"include_usage": True}, + ) + for chunk in stream: + if chunk.choices: + delta = chunk.choices[0].delta if chunk.choices[0] else None + if delta is not None and getattr(delta, "content", 
None): + if ttft is None: + ttft = time.time() - t0 + chunks.append(delta.content) + chunk_usage = getattr(chunk, "usage", None) + if chunk_usage is not None: + usage = chunk_usage + elapsed = time.time() - t0 + content = "".join(chunks) + self._record_telemetry(usage, ttft=ttft, elapsed=elapsed) + return _LLMResponse(content, usage=usage, ttft=ttft, elapsed=elapsed) + + def _record_telemetry(self, usage, ttft, elapsed): + prompt_tokens = 0 + cached = 0 + if usage is not None: + _cache_stats["calls"] += 1 + prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0 + _cache_stats["prompt_tokens"] += prompt_tokens + details = getattr(usage, "prompt_tokens_details", None) + if details is not None: + cached = getattr(details, "cached_tokens", 0) or 0 + elif isinstance(usage, dict): + cached = (usage.get("prompt_tokens_details") or {}).get("cached_tokens", 0) + _cache_stats["cached_tokens"] += cached + _ttft_samples.setdefault(self._purpose, []).append({ + "ttft": ttft, + "elapsed": elapsed, + "prompt_tokens": prompt_tokens, + "cached_tokens": cached, + }) + + +def get_llm(temperature: float = 0.0, purpose: str = "agent") -> _LLMClient: + return _LLMClient( + purpose=purpose, + model=os.getenv("LLM_MODEL", "gpt-4"), + temperature=temperature, + ) + + +def get_fast_llm(temperature: float = 0.0, purpose: str = "fast") -> _LLMClient: + return _LLMClient( + purpose=purpose, model=os.getenv("FAST_LLM_MODEL", os.getenv("LLM_MODEL", "gpt-4")), temperature=temperature, ) + diff --git a/src/config/prompts/acquire_agent.txt b/src/config/prompts/acquire_agent.txt index 1f1ef9a..21c1cb7 100644 --- a/src/config/prompts/acquire_agent.txt +++ b/src/config/prompts/acquire_agent.txt @@ -1,17 +1,5 @@ You are an expert systematic review librarian with extensive experience constructing searches for Cochrane reviews and clinical practice guidelines. 
You translate clinical PICO questions into comprehensive PubMed Boolean queries, drawing on your deep knowledge of how clinical research is actually reported in the literature. -**Input Data:** -PICO Components: -- Patient: {patient} -- Intervention: {intervention} -- Comparison: {comparison} -- Outcome: {outcome} -- Keywords: {keywords} - -**Previous Search Feedback (if any):** -{backtrack_context} -(Note: If feedback indicates "0 results", broaden the search by removing restrictive terms or focusing only on P and I. If "too many results", apply stricter filters.) - **Worked Example (study this translation carefully):** PICO: Patients with AMI / Aspirin / No aspirin / Mortality @@ -35,3 +23,19 @@ Why: Clinical trials never describe their control arm as "no aspirin" — they u ```pubmed ((Patient Term 1 OR Patient Term 2) AND (Intervention Term 1 OR Intervention Term 2)) ``` + +%%USER_INPUT_BELOW%% + +Translate the following PICO into a PubMed Boolean query. Output **strictly** in the response format specified above: a brief **Reasoning:** section, then the query inside a ```pubmed code block. Do NOT output a free-form explanation; do NOT propose multiple alternative queries. + +**Input Data:** +PICO Components: +- Patient: {patient} +- Intervention: {intervention} +- Comparison: {comparison} +- Outcome: {outcome} +- Keywords: {keywords} + +**Previous Search Feedback (if any):** +{backtrack_context} +(Note: If feedback indicates "0 results", broaden the search by removing restrictive terms or focusing only on P and I. If "too many results", apply stricter filters.) 
diff --git a/src/config/prompts/apply_agent.txt b/src/config/prompts/apply_agent.txt index 759c0e5..8903845 100644 --- a/src/config/prompts/apply_agent.txt +++ b/src/config/prompts/apply_agent.txt @@ -35,11 +35,14 @@ Before formulating a recommendation, verify that the retrieved evidence is appli **Step 2 - Evidence Sufficiency Assessment:** Determine if the evidence is sufficient to make a recommendation: - If no relevant evidence was retrieved → output "No Recommendation" -- If evidence quality is Very Low AND results are inconclusive/conflicting AND no indirect evidence available → output "Insufficient Evidence" +- If evidence quality is Very Low AND results are **conflicting** (opposing directions) AND no indirect evidence available → output "Insufficient Evidence" +- If evidence quality is Very Low AND results are **consistent** (same direction, even if imprecise) → proceed to Step 3 with "Weak" and explicit caveats about low certainty - If direct evidence is absent but consistent indirect evidence exists (different population, surrogate endpoint, similar intervention) → proceed to Step 3 with "Conditional" - If no direct or indirect study evidence, but expert consensus / clinical practice guidelines support a recommendation → proceed to Step 3 with "Consensus-based" - If evidence is Low quality or above AND results are consistent → proceed to Step 3 +> **Important**: "Insufficient Evidence" should be reserved for situations where results genuinely conflict or no relevant evidence exists at all. Do NOT output "Insufficient Evidence" simply because evidence quality is Very Low — consistent Very Low evidence still supports a Weak recommendation with caveats. 
+ **Step 3 - Recommendation Formulation (only if evidence is sufficient):** - Strength must strictly follow evidence quality: - High/Moderate quality + consistent results → "Strong" @@ -49,11 +52,16 @@ Determine if the evidence is sufficient to make a recommendation: - Only expert consensus / guidelines, no direct study evidence → "Consensus-based" (cite the specific guideline or consensus) - Do NOT upgrade strength based on clinical intuition or training knowledge. -**Step 4 - Address Previous Feedback:** +**Step 4 - Determine Adopted Evidence Quality:** +Based on Steps 1-3, identify which evidence items you are actually adopting for this recommendation (i.e., evidence that is sufficiently direct and relevant to the PICO). Report the collective GRADE quality of ONLY the adopted evidence as `evidence_quality`. Do NOT average in evidence that you rejected due to population mismatch, wrong comparator, or irrelevant outcomes. +- If strength is "Insufficient Evidence" or "No Recommendation" → set `evidence_quality` to "Very Low" +- If only indirect evidence was adopted (Conditional) → set `evidence_quality` to the grade of that indirect evidence + +**Step 5 - Address Previous Feedback:** If there is "Previous Attempt Feedback", explicitly acknowledge each issue and explain how you are correcting it. **Language Requirement:** -All JSON output fields (`recommendation`, `rationale`, `caveats`) **must be written in Chinese**, as this is the final user-facing output. The `strength` field uses the fixed English labels below. +All JSON output fields (`recommendation`, `rationale`, `caveats`) **must be written in Chinese**, as this is the final user-facing output. The `strength` and `evidence_quality` fields use the fixed English labels below. **Output Format:** @@ -62,14 +70,16 @@ All JSON output fields (`recommendation`, `rationale`, `caveats`) **must be writ 1. PICO Consistency Check results (population/intervention/outcome match assessment) 2. 
Evidence sufficiency assessment (why sufficient or not) 3. Strength determination logic (which specific evidence supports which conclusion) -4. How you addressed any previous feedback +4. Adopted evidence quality: which items were adopted and why, and their collective GRADE level +5. How you addressed any previous feedback > **JSON:** ```json {{ - "recommendation": "【中文】具体可执行的推荐内容,或"证据不足:[原因]",或"无法推荐:[原因]"", + "recommendation": "【中文】具体可执行的推荐内容,或\"证据不足:[原因]\",或\"无法推荐:[原因]\"", "strength": "Strong or Weak or Conditional or Consensus-based or Insufficient Evidence or No Recommendation", + "evidence_quality": "High or Moderate or Low or Very Low", "rationale": "【中文】引用具体证据条目支持推荐,如来自不同人群需明确说明", "caveats": [ "【中文】列出PICO不匹配问题(如:证据来自成人人群,儿童适用性不确定)", @@ -77,3 +87,4 @@ All JSON output fields (`recommendation`, `rationale`, `caveats`) **must be writ ] }} ``` + diff --git a/src/config/prompts/appraise_agent.txt b/src/config/prompts/appraise_agent.txt index 13ccff3..0462227 100644 --- a/src/config/prompts/appraise_agent.txt +++ b/src/config/prompts/appraise_agent.txt @@ -17,7 +17,9 @@ ## GRADE分类规则 ### 一、研究类型(study_type) -仅根据摘要和来源判断研究设计类型(**必须从以下列表中选择一个**): +根据以下优先级判断研究设计类型(**必须从以下列表中选择一个**): + +> **优先级规则**:若证据条目中提供了 `PubMed pub_types` 字段,**优先以此为准**,不要依赖摘要文字猜测。映射关系:`Randomized Controlled Trial` → `RCT`;`Meta-Analysis` → `META_ANALYSIS`;`Systematic Review` → `SYSTEMATIC_REVIEW`;`Observational Study` / `Cohort Study` → `COHORT`;`Case-Control Study` → `CASE_CONTROL`;`Practice Guideline` / `Guideline` → `GUIDELINE`;`Review`(不含 Systematic Review)→ `NARRATIVE_REVIEW`。仅在 pub_types 缺失或无法映射时,再参考摘要文字判断。 - `RCT`:随机对照试验(初始等级:High) - `SYSTEMATIC_REVIEW`:系统综述(含明确检索方案和纳入标准,可含或不含Meta分析)(初始等级:High) - `META_ANALYSIS`:Meta分析(对多项原始研究进行定量统计合并)(初始等级:High) @@ -39,26 +41,57 @@ ### 二、降级因素(每项独立判断,基于摘要可推断的信息) +> **研究类型豁免规则(GRADE标准)**:`NARRATIVE_REVIEW` 和 `EXPERT_OPINION` 初始等级本就是 Very Low,四项降级因素**一律填 `NA`**,升级因素同样填 `NA`,`upgrade_blocked_by_bias` 填 `false`。不要对这两类研究类型强行套用降级框架。 +> +> **NA 的使用范围严格限定**:除 
NARRATIVE_REVIEW/EXPERT_OPINION 外,RCT、SR/MA/NMA、COHORT、CASE_CONTROL、GUIDELINE 等所有其他研究类型,**必须**在 NOT_SERIOUS / SERIOUS / VERY_SERIOUS 中选择,不允许填 NA(唯一例外:`inconsistency` 字段对单篇独立研究必须填 `NA`,见该字段的"单篇研究强制规则")。"摘要截断导致信息不足"不是填 NA 的理由,信息不足时应保守评估(见各字段说明)。 + **risk_of_bias(偏倚风险)** -- `NOT_SERIOUS`:无明显偏倚风险(如RCT报告了正规随机化和盲法) -- `SERIOUS`:存在明显偏倚风险(如未提及盲法、开放标签、缺乏分配隐藏)→ **-1级** + +> **大型RCT豁免规则**:若摘要明确描述了以下任意一项,`risk_of_bias` 应为 `NOT_SERIOUS`,不得仅因"摘要未详述随机序列/分配隐藏方法"而降为 SERIOUS: +> - "double-blind, placebo-controlled"(双盲安慰剂对照) +> - "randomized, double-blind" +> - 大型多中心 RCT(样本量 >1000)且摘要已明确双盲或安慰剂对照 +> +> 判 SERIOUS 的条件:摘要**没有**提及盲法/随机化,或明确说明存在开放标签、无对照、选择偏倚等问题。 + +- `NOT_SERIOUS`:无明显偏倚风险(RCT报告了盲法/随机化,或摘要已明确双盲安慰剂对照) +- `SERIOUS`:存在明显偏倚风险,或摘要**完全未提及**盲法/随机化方法 → **-1级** - `VERY_SERIOUS`:存在严重偏倚风险(如无对照、严重选择偏倚)→ **-2级** +- `NA`:**仅限** NARRATIVE_REVIEW / EXPERT_OPINION **inconsistency(不一致性)** -- `NOT_SERIOUS`:当前证据内部或与其他证据结论一致 -- `SERIOUS`:与其他证据结论存在明显不一致 → **-1级** -- `VERY_SERIOUS`:结论严重不一致,无法解释 → **-2级** -- `NA`:仅有一篇研究,不适用 + +> **单篇研究强制规则**:`inconsistency` 评估的是**同一证据体内多项研究之间**的结论一致性。对于**单篇独立研究**(RCT、COHORT、CASE_CONTROL、CROSS_SECTIONAL 等非 SR/MA/NMA),**必须填 `NA`**,不允许填 NOT_SERIOUS/SERIOUS/VERY_SERIOUS。 +> +> 不得将"与证据列表中其他研究的 PICO 不同"解读为单篇研究的 inconsistency,那属于 indirectness 的范畴。 + +- `NOT_SERIOUS`:(仅适用于 SR/MA/NMA)纳入研究间结论一致,无明显异质性 +- `SERIOUS`:(仅适用于 SR/MA/NMA)纳入研究间存在明显不一致 → **-1级** +- `VERY_SERIOUS`:(仅适用于 SR/MA/NMA)结论严重不一致,I²极高 → **-2级** +- `NA`:**单篇研究**(RCT/COHORT等),或 NARRATIVE_REVIEW / EXPERT_OPINION **indirectness(间接性)** -- `NOT_SERIOUS`:研究人群、干预和结局与PICO基本吻合 -- `SERIOUS`:存在明显间接性(如不同人群、代理结局)→ **-1级** -- `VERY_SERIOUS`:严重间接性(如完全不同的人群或干预)→ **-2级** + +> **判断基准**:indirectness 只针对**该研究本身**的 P/I/C/O 与当前目标 PICO 的匹配程度,不与证据列表中其他研究比较。即使其他研究的 PICO 与目标不同,也不影响本研究的 indirectness 评分。 + +判断时逐一核对**该研究**的 P/I/C/O 与目标 PICO 的对应要素: + +- `NOT_SERIOUS`:研究人群的疾病诊断/阶段与目标 PICO 一致,干预和结局也基本吻合 +- `SERIOUS`:存在以下任一情形 → **-1级** + - 研究人群的**疾病诊断或疾病阶段**与目标 PICO 不同(如糖尿病前期 vs 确诊 T2DM) + - 研究报告的是**代理结局**(如 HbA1c)而目标 PICO 要求**临床结局**(如心血管事件) + - 干预或对照与目标 PICO 存在明显差异 +- 
`VERY_SERIOUS`:上述多项同时存在,或该研究的人群/干预与目标 PICO 完全不同 → **-2级** +- `NA`:**仅限** NARRATIVE_REVIEW / EXPERT_OPINION **imprecision(不精确性)** -- `NOT_SERIOUS`:样本量足够,置信区间窄,结论稳定 -- `SERIOUS`:样本量有限或置信区间宽,存在不确定性 → **-1级** + +> **摘要截断处理原则**:若摘要中无效应量/CI/样本量信息,不能直接填 NA。应根据研究类型和规模综合判断:大型RCT/SR通常样本量充足可填 NOT_SERIOUS;若完全无法判断,保守填 SERIOUS;`NA` 仅限 NARRATIVE_REVIEW/EXPERT_OPINION。 + +- `NOT_SERIOUS`:样本量足够,置信区间窄,结论稳定;或大型研究摘要虽无CI但规模已知足够 +- `SERIOUS`:样本量有限、置信区间宽,或无法从摘要评估精确性 → **-1级** - `VERY_SERIOUS`:样本量极少或置信区间极宽,结论极不稳定 → **-2级** +- `NA`:**仅限** NARRATIVE_REVIEW / EXPERT_OPINION **publication_bias(发表偏倚)** - `SUSPECTED`:有发表偏倚迹象(如只有阳性结果的小样本研究)→ **-1级** @@ -131,10 +164,10 @@ "evidence_id": 1, "study_type": "RCT | SYSTEMATIC_REVIEW | META_ANALYSIS | NMA | COHORT | CASE_CONTROL | CROSS_SECTIONAL | NARRATIVE_REVIEW | CASE_REPORT | GUIDELINE | EXPERT_OPINION", "included_study_type": "RCT | OBSERVATIONAL | MIXED | UNKNOWN | NA", - "risk_of_bias": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS", + "risk_of_bias": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS | NA", "inconsistency": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS | NA", - "indirectness": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS", - "imprecision": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS", + "indirectness": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS | NA", + "imprecision": "NOT_SERIOUS | SERIOUS | VERY_SERIOUS | NA", "publication_bias": "SUSPECTED | UNDETECTED", "large_effect": "YES | NO | NA", "dose_response": "YES | NO | NA", @@ -157,3 +190,4 @@ "evidence_summary": "对整体证据体的简要总结,包括最高层级证据类型和整体质量特征" }} ``` + diff --git a/src/config/prompts/ask/router_unified.txt b/src/config/prompts/ask/router_unified.txt new file mode 100644 index 0000000..6fba2f7 --- /dev/null +++ b/src/config/prompts/ask/router_unified.txt @@ -0,0 +1,142 @@ +You are a clinical question triage and structuring expert for an Evidence-Based Medicine (EBM) decision-support system. + +Your job has **two parts** in a single response: +1. 
**Route classification**: classify the question into one of three categories, and identify question type + EBM framework. +2. **Query structuring** (only when applicable): for non-Diagnosis full-pipeline questions, also produce the structured PICO/PEO/Prognosis query so the downstream pipeline can search literature directly. + +Diagnosis questions and sub-question decompositions do NOT include structured query in this step — they are handled by follow-up prompts. + +--- + +## Step 1 — Route Classification + +Classify the question into exactly one of: + +**direct_answer** — The question does NOT require a literature search. Use this when: +- Factual/definitional query answerable from established medical knowledge (e.g., "What is the normal range of serum sodium?", "What does LVEF stand for?") +- Drug dosing, pharmacokinetics, or well-established clinical thresholds +- Clearly outside the scope of clinical EBM (administrative, billing, non-clinical) +- Confidence that a literature search would add no value: > 0.85 + +**full_pipeline** — The question requires a full EBM literature search (Ask → Acquire → Appraise → Apply). Use this when: +- The question involves treatment efficacy, diagnostic accuracy, prognosis, harm, or prevention +- The answer depends on the current state of clinical evidence +- The question has a single, well-defined PICO/PIRD/PEO/Prognosis focus + +**sub_questions** — The question is complex and must be decomposed into 2–4 independent sub-questions, each requiring its own literature search. 
Use this when: +- The question contains multiple distinct clinical decisions (e.g., "Should I use drug A or B, and what monitoring is needed?") +- The question spans multiple EBM domains (e.g., both diagnosis and treatment) +- A single PICO cannot capture the full scope + +--- + +## Step 2 — Question Type Classification + +Classify into one of: +- `Therapy`: Treatment efficacy, drug comparisons, interventions +- `Diagnosis`: Diagnostic test accuracy, sensitivity/specificity, screening +- `Prognosis`: Disease course, survival, risk factors, long-term outcomes +- `Harm`: Adverse effects, risks, harmful exposures +- `Prevention`: Preventive interventions before disease onset +- `Background`: Factual/definitional questions (use with direct_answer route) +- `Mixed`: Spans multiple types (use with sub_questions route) + +**Therapy priority rule (apply before defaulting to Mixed)**: +If the question explicitly asks for any of the following — treatment plan, drug choice / dose / duration, regimen, monitoring under therapy, "治疗方案 / 用药 / 剂量 / 疗程 / 推荐方案 / 一线治疗 / 首选药物 / 给药" — classify as `Therapy` even when the case mentions multiple comorbidities. Comorbidities affect PICO patient definition but do NOT promote the type to `Mixed`. Only use `Mixed` when the question contains two or more distinct EBM intents (e.g., both "如何诊断" AND "如何治疗" in the same sentence). 
+ +--- + +## Step 3 — EBM Framework Selection (for full_pipeline and sub_questions only) + +Select the most appropriate query framework: +- `pico`: Therapy / Prevention / Harm questions (Patient, Intervention, Comparison, Outcome) +- `pird`: Diagnosis questions (Patient, Index test, Reference standard, Diagnosis target) +- `peo`: Epidemiology / Harm / Prognosis questions without a direct intervention (Patient, Exposure, Outcome) +- `prognosis`: Pure prognosis questions (Patient, Prognostic factor, Outcome, Time horizon) +- `diagnostic_reasoning`: Complex differential diagnosis requiring clinical reasoning steps + +--- + +## Step 4 — Structured Query (NEW: do this in the SAME response) + +**Skip Step 4 when**: +- `route_type` = "direct_answer" — leave `query` as null +- `route_type` = "sub_questions" — leave `query` as null (each sub-question is structured separately) +- `question_type` = "Diagnosis" or `ebm_framework` ∈ {{"pird", "diagnostic_reasoning"}} — leave `query` as null (diagnostic questions require a separate two-step diagnostic reasoning pass) + +**Otherwise (full_pipeline + non-Diagnosis)**, produce the structured query based on the selected `ebm_framework`: + +### 4a. ebm_framework = "pico" (Therapy / Prevention / Harm) +- **P — Patient/Problem**: specific population, disease, or condition. Include age/sex/stage/comorbidities if stated or clearly implied. +- **I — Intervention** (`primary_focus`): the main treatment, drug, procedure, or exposure being evaluated. +- **C — Comparison** (`comparator`): alternative treatment, placebo, standard of care, or "unexposed population" for harm. If not stated, infer the most clinically appropriate comparator. +- **O — Outcome**: primary clinical outcome(s) (mortality, recurrence, AE, QoL, hospitalization). + +### 4b. ebm_framework = "peo" (Epidemiology / Harm / Etiology) +- **P — Population** (`patient`): the population being studied. 
+- **E — Exposure** (`primary_focus`): risk factor, environmental condition, genetic variant, etc. NOT an active intervention. +- **O — Outcome**: health outcome of interest. +- Set `comparator` to the reference / unexposed group (or null). +- Set `exposure_type` to one of: `risk_factor | environmental | genetic | behavioral | iatrogenic | comorbidity`. + +### 4c. ebm_framework = "prognosis" (Pure prognosis questions) +- **P — Patient**: population with the disease/condition. +- **Prognostic Factor** (`primary_focus`): the factor whose prognostic value is assessed (e.g., "LVEF < 35%", "KRAS mutation"). If the question is about overall prognosis, use the disease itself. +- **O — Outcome**: clinical outcome (5-year OS, DFS, recurrence, mortality, etc.). +- **T — Time Horizon** (`time_horizon`): the time frame (e.g., "5-year survival"). Infer if not stated. +- Set `comparator` to the reference group (or null). +- Set `prognosis_subtype` to one of: `overall_prognosis | prognostic_factor | survival_analysis | risk_stratification | disease_progression`. + +--- + +## Language Rules + +- `routing_rationale`, `direct_answer_text`, `sub_question_texts`: same language as input question. +- `query.patient`, `query.primary_focus`, `query.comparator`, `query.outcome`, `query.time_horizon`: **same language as input question**. +- `query.keywords`: **always English MeSH terms and synonyms**, regardless of input language. PubMed only supports English queries. + +--- + +## Output Format + +**Reasoning:** +<Brief analysis: route, question type, EBM framework. If full_pipeline and not Diagnosis, also note the PICO/PEO/Prognosis decomposition. 
If direct_answer, indicate the answer will follow in a separate prompt — do NOT answer here.> + +**JSON:** +```json +{{ + "route_type": "direct_answer | full_pipeline | sub_questions", + "route_confidence": 0.0, + "question_type": "Therapy | Diagnosis | Prognosis | Harm | Prevention | Background | Mixed", + "ebm_framework": "pico | pird | peo | prognosis | diagnostic_reasoning | null", + "direct_answer_text": "null — the direct answer (when applicable) is produced by a separate prompt downstream", + "sub_question_texts": ["sub-question 1", "sub-question 2"] or null, + "routing_rationale": "One sentence explaining the routing decision in input language", + "query": null OR {{ + "query_type": "pico | peo | prognosis", + "patient": "【Patient population in input language】", + "primary_focus": "【Intervention/Exposure/Prognostic factor in input language】", + "comparator": "【Comparison in input language, or null】", + "outcome": "【Primary outcome(s) in input language】", + "reference_standard": null, + "time_horizon": "【Time horizon in input language, or null】", + "exposure_type": "【only when query_type=peo】 risk_factor | environmental | genetic | behavioral | iatrogenic | comorbidity", + "prognosis_subtype": "【only when query_type=prognosis】 overall_prognosis | prognostic_factor | survival_analysis | risk_stratification | disease_progression", + "keywords": [ + "patient MeSH term", + "patient synonym", + "intervention/exposure MeSH term", + "intervention/exposure synonym", + "outcome MeSH term", + "outcome synonym" + ] + }} +}} +``` + +--- + +## Input + +Clinical Question: {question} +Backtrack Context (if any): {backtrack_context} diff --git a/src/config/prompts/judge/acquire_judge.txt b/src/config/prompts/judge/acquire_judge.txt index e338b2f..86ea280 100644 --- a/src/config/prompts/judge/acquire_judge.txt +++ b/src/config/prompts/judge/acquire_judge.txt @@ -41,7 +41,13 @@ Acquire Agent 输出(已排序的证据列表):{stage_output} - PARTIAL:同类方法但有差异(不同剂量/版本),相关性高 - NO:完全不同的测试/干预/暴露 -## 
R3. outcome_match【Critical,权重3】 +## R2b. p_match【Critical,权重3】 +基于证据列表中人群匹配度最好的那篇证据判断:研究人群是否与查询 Patient 匹配? +- YES:精准匹配(相同年龄段、相同疾病状态) +- PARTIAL:有轻微差异,结论可审慎外推 +- NO:严重不匹配(成人证据用于儿科;完全不同疾病) + +## R3. o_match【Critical,权重3】 基于证据列表中结局匹配度最好的那篇证据判断:证据是否报告了临床关心的结局指标? - YES:报告了直接结局指标 - PARTIAL:报告了代理指标或部分相关结局 @@ -66,9 +72,13 @@ Acquire Agent 输出(已排序的证据列表):{stage_output} - ebm_pird:第一优先级=SR/Meta分析(基于诊断准确性研究),第二=诊断准确性研究(横断面),第三=回顾性诊断研究,排除=机制综述/治疗类RCT - ebm_peo:第一优先级=SR/Meta分析(基于观察性研究),第二=前瞻性队列,第三=病例对照,排除=RCT/机制综述 - ebm_prognosis:第一优先级=SR/Meta分析(基于队列研究),第二=前瞻性队列,第三=回顾性队列,排除=机制综述/病例报告 + +**重要原则:PARTIAL 反映该临床领域文献结构限制,不代表检索失败,不需要重新检索。** +如果该领域本身缺乏第一优先级文献(如罕见病、新兴干预、特定诊断测试),有次优先级文献已是合理结果。 + - YES:有第一优先级文献 -- PARTIAL:有次优先级文献但无第一优先级,或混入少量不匹配设计 -- NO:大量纳入与 route_type 不匹配的研究设计 +- PARTIAL:有次优先级文献但无第一优先级(**此为该领域文献限制,非检索错误**);或混入少量不匹配设计但主体匹配 +- NO:大量纳入与 route_type 完全不匹配的研究设计(如 PIRD 问题全部是治疗 RCT,无任何诊断准确性研究) ## R7. population_match【Major,权重2】 基于证据列表中人群匹配度最好的那篇证据判断:研究人群是否与查询 Patient 匹配? @@ -94,6 +104,13 @@ Top 文章的 key_sentences 字段是否有实质内容? - PARTIAL:部分文章 key_sentences 为空(摘要极短导致 chunk 失败) - NO:所有文章 key_sentences 均为空,RAG 流程可能失败 +## 5. 全文与 RAG 质量审计(full_text_audit) + +**full_text_coverage**:Top 文章(排名前3)中,has_full_text=True 的比例是否合理? +- `GOOD`:≥2/3 篇有全文 +- `PARTIAL`:1/3 篇有全文,或全文获取部分失败 +- `NONE`:Top 3 篇均无全文 + # Output Format 仅输出以下 JSON,不要包含任何其他文本: @@ -105,7 +122,8 @@ Top 文章的 key_sentences 字段是否有实质内容? "rubric_results": {{ "keywords_cover_pico_dimensions": "YES | PARTIAL | NO", "primary_focus_match": "YES | PARTIAL | NO", - "outcome_match": "YES | PARTIAL | NO", + "p_match": "YES | PARTIAL | NO", + "o_match": "YES | PARTIAL | NO", "keywords_have_synonyms": "YES | PARTIAL | NO", "keywords_count_sufficient": "YES | PARTIAL | NO", "study_design_matches_route": "YES | PARTIAL | NO", @@ -114,6 +132,10 @@ Top 文章的 key_sentences 字段是否有实质内容? 
"selection_count_appropriate": "YES | PARTIAL | NO", "key_sentences_present": "YES | PARTIAL | NO" }}, + "full_text_audit": {{ + "full_text_coverage": "GOOD | PARTIAL | NONE", + "key_sentences_present": "YES | PARTIAL | NO" + }}, "search_exhausted": false, "failures": ["具体失败项及原因(无失败则为空列表)"], "overall_quality": "pass | fail | gate_fail" diff --git a/src/config/prompts/judge/apply_judge.txt b/src/config/prompts/judge/apply_judge.txt index 992b77b..ee6e338 100644 --- a/src/config/prompts/judge/apply_judge.txt +++ b/src/config/prompts/judge/apply_judge.txt @@ -39,11 +39,21 @@ Apply 的维度一致性检查是否使用了与 route_type 匹配的框架? ## R2. strength_matches_evidence【Critical,权重3】 推荐强度是否与证据等级严格匹配? + +**判断依据**:以 Apply Agent 输出中的 `evidence_quality` 字段为准(该字段反映 Apply 实际采纳证据的质量,已排除人群不匹配或结局不相关的证据)。不要用 Appraise 阶段的全量 GRADE 分布覆盖此判断——Apply 可能合理地排除了部分证据。 + 注意:inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐属正确行为,不应标注为不匹配。 -EBM原则:Strong需要High/Moderate直接证据;Weak适用于Low质量或结果不一致;Conditional适用于仅有间接证据;Consensus-based适用于仅有专家共识/指南。 -- YES:推荐强度与证据等级严格匹配(含上述特殊情况) + +EBM原则(基于 evidence_quality): +- evidence_quality=High/Moderate + 结果一致 → Strong 合理 +- evidence_quality=Low 或结果不一致 → Weak 合理 +- evidence_quality=Very Low + 结果一致 → Weak 合理(有明确 caveats) +- 仅间接证据 → Conditional 合理 +- 仅专家共识/指南 → Consensus-based 合理 + +- YES:推荐强度与 `evidence_quality` 严格匹配(含上述特殊情况) - PARTIAL:有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 -- NO:推荐强度与证据等级明显不符(不触发 gate 的中等程度不匹配) +- NO:推荐强度与 `evidence_quality` 明显不符(不触发 gate 的中等程度不匹配) ## R3. population_applicability_addressed【Major,权重2】 是否明确说明了证据人群与当前患者的匹配程度,包括可外推性或外推限制? 
@@ -80,6 +90,9 @@ EBM原则:Strong需要High/Moderate直接证据;Weak适用于Low质量或结 ```json {{ + "route_audit": {{ + "route_dimension_consistent": "YES | PARTIAL | NO | NA" + }}, "gate_results": {{ "recommendation_grounded_in_evidence": "YES | NO", "route_dimension_consistent": "YES | NO", @@ -98,3 +111,4 @@ EBM原则:Strong需要High/Moderate直接证据;Weak适用于Low质量或结 "overall_quality": "pass | fail | gate_fail" }} ``` + diff --git a/src/config/prompts/judge/appraise_judge.txt b/src/config/prompts/judge/appraise_judge.txt index f80e48d..422c54e 100644 --- a/src/config/prompts/judge/appraise_judge.txt +++ b/src/config/prompts/judge/appraise_judge.txt @@ -15,26 +15,75 @@ Appraise Agent 输出(包含分类标签和计算结果):{stage_output} ## G1. study_type_correct 所有研究的 study_type 识别是否正确? -- YES:所有研究的 study_type 识别正确 -- NO:存在明显错误(如将观察性研究标记为RCT) + +**判断优先级**:Appraise Agent 优先使用 PubMed `pub_types` 元数据(权威字段,由 PubMed 索引人员标注)。判断时应遵循以下规则: +- 若 Agent 的 study_type 与 `pub_types` 一致 → **必须判 YES**,即使摘要文字描述不够清晰 +- 若 `pub_types` 缺失,Agent 依据摘要文字推断 → 只要推断方向合理(如摘要有"randomized"字样标为RCT)即判 YES +- 判 NO 的条件:**摘要明确、直接地说明了与 pub_types 相反的设计**(如 pub_types 写 RCT,但摘要第一句写"This is an observational cohort study");或研究类型在 GRADE 合法范围之外 + +**常见合理情形(应判 YES)**: +- pub_types 含 `Randomized Controlled Trial` → Agent 标 RCT → YES(即使摘要提到电子病历数据) +- pub_types 含 `Meta-Analysis` → Agent 标 META_ANALYSIS → YES +- pub_types 缺失,摘要含 "cohort" → Agent 标 COHORT → YES + +- YES:所有研究的 study_type 识别符合上述规则 +- NO:存在明显错误,且摘要有直接相反证据(如 pub_types=RCT 但摘要明确写"观察性研究") ## G2. computed_grade_reasonable -系统计算出的最终GRADE等级(computed_grade)是否合理? -- YES:计算结果与基于摘要的独立判断一致 -- NO:明显不合理(通常是 study_type 或降级因素错误导致) +给定 Appraise Agent 输出的分类标签(study_type、included_study_type、各降级/升级因素),系统计算出的最终 GRADE 等级在数学逻辑上是否正确? 
+ +**前提条件**:仅在 G1=YES(study_type 识别正确)的前提下才作为硬 Gate。若 G1=NO,则本项自动填 UNCERTAIN,不触发 Gate。 -注意:以下情况属于合理结果,不应判断为 NO: -- SR/MA 纳入观察性研究(included_study_type=OBSERVATIONAL)→ 初始分为 Low,即使无降级因素也可能输出 Low/Very Low -- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时,即使 large_effect=YES 也不升级 -- CROSS_SECTIONAL 无升级因素 → 最高只能到 Low +**核心原则:本项只验证"标签 → 等级"的数学计算路径,不重新评估降级因素本身是否合理。** +降级因素(risk_of_bias/indirectness/imprecision等)的合理性由 R1 负责,本项不重复判断。 + +判断方法:按以下公式验算:初始分(由study_type决定)- 降级分(SERIOUS=-1,VERY_SERIOUS=-2)+ 升级分(仅COHORT/CASE_CONTROL且无严重偏倚)= 最终等级。只要计算结果与输出等级一致,就判 YES。 + +- YES:给定这些标签,计算出的等级符合 GRADE 数学规则 +- NO:计算路径**明显**错误(如标签显示 RCT + 无任何降级因素,但输出 Very Low;或 COHORT 标了 large_effect=YES 且 risk_of_bias=NOT_SERIOUS,但等级未升级) +- UNCERTAIN:满足以下任一条件时使用,**降级为 MAJOR 问题,不触发 Gate**: + - 摘要截断,无法确认 study_type 或关键降级因素 + - 研究设计存在歧义(如"target trial emulation"、SR内含NMA等混合设计) + - 降级因素标注本身存在争议(如 risk_of_bias 的严重程度有合理分歧),导致预期等级范围不确定 + +重要:以下情况均属于计算正确,必须判断为 YES: +- RCT 因 indirectness=SERIOUS 降级后输出 Low 或 Very Low → 正确 +- SR/MA 因 included_study_type=OBSERVATIONAL 初始分为 Low,再降级后输出 Very Low → 正确 +- SR/MA 因 included_study_type=UNKNOWN 初始分为 Moderate,降级后输出 Low/Very Low → 正确 +- COHORT/CASE_CONTROL 存在 SERIOUS 偏倚时升级因素被阻断,等级未提升 → 正确 +- CROSS_SECTIONAL 无升级因素,最高只能到 Low → 正确 +- NMA 因 indirectness=SERIOUS 降级 → 正确 # Rubric 评分项 ## R1. downgrade_factors_appropriate【Critical,权重3】 四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注是否与摘要信息相符? 
-- YES:各因素的严重程度标签(NOT_SERIOUS/SERIOUS/VERY_SERIOUS)与摘要信息相符 -- PARTIAL:整体合理,但个别因素评估过于宽松或严苛 -- NO:存在明显错误(如未盲法 RCT 标记为 NOT_SERIOUS 偏倚风险) + +**GRADE标准下的合理行为(判YES或PARTIAL,不判NO):** +- 摘要截断导致无法确认效应量/CI/样本量 → 标注 `imprecision=SERIOUS` 是合理的保守判断 +- 单篇研究标注 `inconsistency=NA` 是**唯一正确**做法(见下方强制规则) +- NARRATIVE_REVIEW/EXPERT_OPINION 各因素标注 `NA` 是正确做法 +- 大型双盲安慰剂对照 RCT 标注 `risk_of_bias=NOT_SERIOUS` 是正确做法(见下方豁免规则) + +**risk_of_bias 豁免规则**:摘要已明确描述"double-blind, placebo-controlled"或"randomized, double-blind",或大型多中心 RCT(>1000人)且明确双盲/安慰剂对照 → 标注 `NOT_SERIOUS` 正确,不应因"未详述分配隐藏方法"而判为错误。 + +**inconsistency 强制规则**:单篇独立研究(RCT/COHORT/CASE_CONTROL/CROSS_SECTIONAL)的 `inconsistency` **必须为 NA**。若 Agent 对单篇研究标注了 NOT_SERIOUS/SERIOUS/VERY_SERIOUS,这是错误的,应判 NO。 + +**indirectness 判断基准**:只评估**该研究本身**的 PICO 与目标 PICO 的匹配程度。不得将"证据列表中其他研究 PICO 不同"作为该研究 indirectness 的判断依据。 + +**判 NO 的条件(明显违反GRADE标准):** +- 摘要明确描述了双盲/安慰剂对照,仍标注 `risk_of_bias=SERIOUS/VERY_SERIOUS` +- 单篇 RCT/COHORT 等研究的 `inconsistency` 标注为 NOT_SERIOUS/SERIOUS/VERY_SERIOUS(应为 NA) +- 将证据列表中其他研究的 PICO 差异作为本研究 `indirectness` 降级的理由 +- 升级因素(large_effect、dose_response)被错误地用于 RCT 或 SR/MA(应填 NA) + +**判 PARTIAL 的条件:** +- 整体合理,但个别因素评估严苛程度有轻微不一致 +- 保守降级的理由可接受,但在部分研究上依据偏弱 + +- YES:各因素标注符合GRADE标准 +- PARTIAL:整体合理,个别因素有轻微偏差 +- NO:存在明显违反上述规则的错误 ## R2. 
included_study_type_correct【Critical,权重3】 (仅当证据列表含 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时判断,否则填 NA) @@ -78,7 +127,7 @@ SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类 {{ "gate_results": {{ "study_type_correct": "YES | NO", - "computed_grade_reasonable": "YES | NO" + "computed_grade_reasonable": "YES | NO | UNCERTAIN" }}, "rubric_results": {{ "downgrade_factors_appropriate": "YES | PARTIAL | NO", @@ -92,3 +141,4 @@ SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类 "overall_quality": "pass | fail | gate_fail" }} ``` + diff --git a/src/config/prompts/judge/ask_judge.txt b/src/config/prompts/judge/ask_judge.txt index dfb1510..38a5484 100644 --- a/src/config/prompts/judge/ask_judge.txt +++ b/src/config/prompts/judge/ask_judge.txt @@ -21,12 +21,14 @@ route_type 与问题类型是否匹配? - NA:route_type = direct_answer,不适用 ## G3. nonresearch_classification_correct(仅当 route_type = direct_answer 时判断,否则填 NA) -以下三条触发条件是否全部满足? -1. 问题要求立即操作性指导(动词:如何处理/立即给/紧急处置) -2. 延迟回答会直接危及患者生命安全 -3. 答案来自已有公认标准流程(BLS/ACLS/指南操作章节) -- YES:三条均满足 -- NO:任一条不满足(应重路由到 EBM 流程) +判断 direct_answer 路由是否合适——即问题是否确实不需要文献检索来回答。 +满足以下任一情形即视为合适: +1. **事实/定义性问题**:答案来自已建立的医学知识(如"LVEF 代表什么"、"血清钠正常范围"、"GRADE 等级有哪些") +2. **药物剂量 / 药代动力学 / 已确立的临床阈值**:查询类问题,答案为公认数值 +3. **紧急操作性指导**:要求立即操作(如何处理/立即给/紧急处置),延迟回答直接危及生命,答案来自公认标准流程(BLS/ACLS/指南操作章节) +4. **明确超出临床 EBM 范畴**:管理、计费、非临床问题 +- YES:问题属于以上任一类型,文献检索不会增加额外价值 +- NO:问题实际上需要循证评估(治疗有效性、诊断准确性、预后、危害、预防等),应重路由到 EBM 流程 - NA:route_type != direct_answer,不适用 # Rubric 评分项(仅适用于 EBM 路由;direct_answer 路由时所有 rubric 填 NA) diff --git a/src/config/prompts/judge/assess_judge.txt b/src/config/prompts/judge/assess_judge.txt index 38078c4..8a448d5 100644 --- a/src/config/prompts/judge/assess_judge.txt +++ b/src/config/prompts/judge/assess_judge.txt @@ -11,14 +11,14 @@ - 证据质量分布: {grade_distribution} - 最终推荐: {recommendation} -## Assess Agent 输出 -{stage_output} - ## 路由信息 - 路由类型:{route_type} - 路由置信度:{route_confidence} - EBM查询描述:{ebm_query_description} +## Assess Agent 输出 +{stage_output} + # Audit Task ## 1. 
回答完整性审计 @@ -36,10 +36,19 @@ ## 2. 推理链完整性审计 逐段检查推理链的连接质量。 -**ask_to_acquire_link**:Ask阶段的PICO是否有效指导了Acquire阶段的检索? -- `CLEAR`:检索策略直接来源于PICO,关键词与P/I/O要素对应明确;且检索词覆盖了 {route_type} 对应的关键维度(ebm_pico→P+I+O;ebm_pird→P+IndexTest+RefStd;ebm_peo→P+Exposure+O;ebm_prognosis→P+PrognosticFactor+O) -- `WEAK`:关联存在但不够紧密,检索词覆盖了PICO的主要方面但有跳跃,或遗漏了 {route_type} 特定维度中的某个关键要素 -- `BROKEN`:检索策略与PICO脱节,检索了与PICO无关的主题 +**ask_to_acquire_link**:Ask阶段的结构化查询是否有效指导了Acquire阶段的检索? +各 route_type 对应的审计重点: +- ebm_pico: 关键词是否覆盖 Patient + Intervention + Outcome +- ebm_pird: 关键词是否覆盖 Patient + Index Test + Target Condition +- ebm_peo: 关键词是否覆盖 Patient + Exposure + Outcome +- ebm_prognosis: 关键词是否覆盖 Patient + Prognostic Factor + Outcome +- diagnostic_reasoning: 关键词是否覆盖 Clinical Presentation + 鉴别诊断方向 +- direct_answer: 不经过 Acquire 阶段,此项标注为 NA + +- `CLEAR`:检索策略直接来源于结构化查询,关键词与对应框架维度对应明确 +- `WEAK`:关联存在但不够紧密,检索词覆盖了主要维度但存在跳跃或遗漏 +- `BROKEN`:检索策略与结构化查询脱节,检索了完全无关的主题 +- `NA`:route_type 为 direct_answer,不适用 **acquire_to_appraise_link**:Acquire阶段获取的证据是否被Appraise阶段正确评价? 
- `CLEAR`:评价了所有关键证据,GRADE评级与检索到的研究类型对应可追溯 @@ -77,18 +86,20 @@ {{ "completeness_audit": {{ "original_question_answered": "YES | PARTIAL | NO", - "evidence_limitations_stated": "YES | NO | NA" + "evidence_limitations_stated": "YES | NO | NA", + "route_confidence_noted": "YES | NO | NA" }}, "chain_audit": {{ - "ask_to_acquire_link": "CLEAR | WEAK | BROKEN", + "ask_to_acquire_link": "CLEAR | WEAK | BROKEN | NA", "acquire_to_appraise_link": "CLEAR | WEAK | BROKEN", "appraise_to_apply_link": "CLEAR | WEAK | BROKEN" }}, "consistency_audit": {{ "grade_to_strength_consistent": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION", - "no_internal_contradictions": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION", - "route_confidence_noted": "YES | NO | NA" + "no_internal_contradictions": "YES | MINOR_ISSUE | MAJOR_CONTRADICTION" }}, - "reasoning": "一句话说明整体推理链质量及主要问题" + "failures": ["具体失败项及原因(无失败则为空列表)"], + "overall_quality": "pass | fail | degraded" }} ``` + diff --git a/src/config/prompts/scheduling_llm.txt b/src/config/prompts/scheduling_llm.txt index fb72ec2..32a1390 100644 --- a/src/config/prompts/scheduling_llm.txt +++ b/src/config/prompts/scheduling_llm.txt @@ -39,6 +39,8 @@ - 问题出在**当前阶段的执行**(如输出格式错误、LLM暂时偏差)?→ 候选动作:`retry_current` - 问题出在**当前阶段的输入**(如PICO不准确导致检索方向错误)?→ 候选动作:`backtrack_to_上游阶段` - 问题属于**轻微瑕疵**(Minor且评分通过)?→ 候选动作:`proceed` +- 问题属于**呈现/格式缺陷**(如 citation_traceable、recommendation_specific、patient_preference_considered)?→ 候选动作:`retry_current`,**绝不 backtrack 到 Acquire 或更早** +- **Acquire 找到了文章但证据不够精准/间接性高**?→ 候选动作:`proceed`,**绝不 backtrack_to_acquire**(见下方核心原则) **动作语义的关键区分**: @@ -49,6 +51,26 @@ **最小充分回退原则**:回退目标应是造成问题的**最近上游阶段**,而非更远的阶段。只有当问题的根源确实在更早的阶段时,才考虑跨越多个阶段回退。 +--- + +### ★ backtrack_to_acquire 的严格触发条件(核心原则) + +`backtrack_to_acquire` **只在以下两种情况下允许使用**: + +1. **Acquire 返回 0 篇结果**:检索完全失败,没有任何文章可以评价 +2. 
**检索到的证据与临床问题完全无关**:疾病/干预与 PICO 根本不同(如问心脏病却检索到骨科文献) + +**以下情况严禁 backtrack_to_acquire,应选择 proceed**: + +- Acquire 找到了文章(哪怕只有 1 篇),但证据质量低、间接性高、研究设计不理想 +- 证据人群与 PICO 人群有差异(如稳定型 vs 急性期)但属于同一疾病 +- 缺少高级别研究(无 SR/RCT),但有观察性研究 +- study_design_matches_route=PARTIAL:说明证据存在但研究设计不完全匹配 + +**理由**:证据间接性和质量不理想是 GRADE 框架中 `indirectness` 和 `imprecision` 降级因素的处理范畴,由 Appraise 打分、Apply 在推荐强度和 caveats 中如实表达(Weak/Conditional 推荐),而不是拒绝推荐。重新检索无法凭空创造不存在的高质量证据,只会浪费预算。 + +--- + ### Step 2:计算回退代价 每次回退都意味着重新执行从目标阶段到当前阶段的所有步骤。请估算代价: @@ -81,6 +103,34 @@ #### low_confidence_data(数值置信度低) MVP阶段属正常,不自动触发回退,可作为输出的注意事项。 +#### appraise_gate_fail(Appraise阶段Gate失败) +当 Appraise 阶段出现 CRITICAL 问题(如 computed_grade_reasonable=NO 或 study_type_correct 错误)时: +- **应选择 `retry_current`**:这是 Appraise 内部的执行问题(LLM对研究类型或GRADE因素的判断偏差),重试通常能修正。 +- **不应选择 `backtrack_to_acquire`**:证据本身没有问题,重新检索无法解决评价逻辑错误。 +- 只有当 Appraise 连续失败3次以上,且每次失败原因相同,才考虑 `backtrack_to_acquire`。 + +#### apply_presentation_issues(Apply 输出呈现类问题) +以下问题属于 **Apply Agent 的输出格式/呈现缺陷**,根本原因在 Apply 内部,与上游证据无关。 +无论这些问题被 Judge 标注为 MAJOR 还是 MINOR,**只允许 `retry_current`,严禁 `backtrack_to_acquire` 或更早阶段**: + +- `citation_traceable=NO`:Apply 使用了内部编号("证据1/2/3")而非 PMID/标题,不代表证据不存在或来源不可靠,重试时提示 Apply 写出 PMID 即可修复。 +- `recommendation_specific=NO/PARTIAL`:推荐内容过于模糊,是 Apply 的表述问题,不是证据问题,重试可修复。 +- `patient_preference_considered=NO`:Apply 遗漏了患者偏好讨论,是输出完整性问题,重试可修复。 + +**误判识别**:如果你看到 Apply 阶段出现上述问题,并且你倾向于 `backtrack_to_acquire`,请停止——你正在把"Apply 没写清楚"误读成"证据不足"。 + +#### technical_limitation(技术限制类问题) +以下问题由技术或数据可得性限制导致,**重新检索无法修复,应选择 `proceed`**: + +- `key_sentences_present=NO`(Acquire Judge):RAG 未能提取关键句,通常因为对应文章没有 PMC 开放全文。重新检索同一批文章不会改变 PMC 开放获取状态,应直接 proceed。 +- `study_design_matches_route=PARTIAL/NO` 且 Acquire 已重试 ≥2 次:该临床问题领域本身可能缺乏高级别研究(无 SR/诊断准确性研究等),重新检索不能凭空创造不存在的证据,应 proceed 让 Apply 如实输出 Insufficient Evidence。 + +#### appraise_factor_misjudgment(Appraise 降级因素判断有误) +- **根因在 Appraise**:LLM 对 risk_of_bias / indirectness / imprecision 等因素的严重程度判断有误,导致 GRADE 等级偏高或偏低。 +- **应选择 `retry_current`**:重新运行 Appraise,让 LLM 重新评估降级因素。 +- **不应选择 
`backtrack_to_apply`**:Apply 的输出(如 evidence_quality 偏高)是 Appraise 错误的下游症状,不是 Apply 本身的问题;重跑 Apply 无法修正上游的 GRADE 计算错误。 +- 只有当 Appraise 连续重试3次以上仍然失败,才考虑 `backtrack_to_acquire`(检索到的证据本身可能不适合回答该问题)。 + ### Step 5:综合判断 结合以上分析,参考决策矩阵: @@ -104,8 +154,8 @@ MVP阶段属正常,不自动触发回退,可作为输出的注意事项。 1. **proceed** - 前进到下一阶段(Ask→Acquire→Appraise→Apply→Assess) 2. **backtrack_to_ask** - 回退到Ask阶段(重新提取PICO;通常只在PICO根本性错误时选择) 3. **backtrack_to_acquire** - 回退到Acquire阶段(重新检索;上游PICO正确但检索方向错误) -4. **backtrack_to_appraise** - 回退到Appraise阶段(重新评价证据;评价有根本性错误) -5. **backtrack_to_apply** - 回退到Apply阶段(重新生成推荐;推荐与证据不符) +4. **backtrack_to_appraise** - 回退到Appraise阶段(重新评价证据;评价有根本性错误,如降级因素标注系统性偏差) +5. **backtrack_to_apply** - 回退到Apply阶段(**仅从 Assess 阶段使用**;从 Apply 阶段重跑 Apply 请用 `retry_current`,不要用此动作,否则会被计入死循环计数) 6. **retry_current** - 原地重试当前阶段(修正执行,不修改上游输入) 7. **terminate** - 终止workflow(证据不足、检索穷尽、无法继续等情况) 8. **request_human_review** - 请求人类审核 @@ -116,17 +166,12 @@ MVP阶段属正常,不自动触发回退,可作为输出的注意事项。 ```json {{ - "reasoning": "按Step 1-5的框架阐述推理: - 1. 问题根源分析(执行问题 vs 输入问题) - 2. 回退代价估算(需要重跑几步,剩余预算够吗) - 3. 历史策略检查(是否重复过同样策略) - 4. 特殊信号处理(如有) - 5. 
最终决策依据", + "reasoning": "1-2句话说明核心决策依据(根因在哪里、选择该动作的理由)", "action": "proceed | backtrack_to_ask | backtrack_to_acquire | backtrack_to_appraise | backtrack_to_apply | retry_current | terminate | request_human_review", "parameters": {{ - "adjust_strategy": "具体调整策略(引用issues中的description作为修改指令)", + "adjust_strategy": "具体调整指令(引用issues中的description)", "focus_on": "需要重点关注的维度", // 仅在 request_human_review 时补充: @@ -141,10 +186,9 @@ MVP阶段属正常,不自动触发回退,可作为输出的注意事项。 ## 决策要求 -- **代价优先**:在质量相近的情况下,优先选择代价更小的动作(retry > backtrack_近 > backtrack_远) -- **参考决策矩阵**:Minor问题且通过评估时通常应proceed,避免过度保守 -- **引用修改建议**:adjust_strategy字段应引用issues中的description,确保Agent收到具体的修复指令 -- **避免无效重复**:多次回退到同一阶段无改善时,升级策略或终止 -- 你的reasoning将被记录用于审计和系统改进,请清晰阐述决策逻辑 +- **代价优先**:retry > backtrack_近 > backtrack_远 +- **Minor问题且通过评估**:直接proceed +- **adjust_strategy**:引用issues中的description作为具体修复指令 +- **避免无效重复**:多次回退同一阶段无改善时升级策略或终止 -请开始推理和决策: +请直接输出JSON,不要添加额外说明: diff --git a/src/coordinator/coordinator.py b/src/coordinator/coordinator.py index a5bb16a..729d2a7 100644 --- a/src/coordinator/coordinator.py +++ b/src/coordinator/coordinator.py @@ -12,6 +12,17 @@ from src.scheduling.scheduling_llm import SchedulingLLM +def _build_feedback(decision: SchedulingDecision) -> str: + """Combine reasoning + adjust_strategy into a single feedback string for agents.""" + parts = [decision.reasoning] + params = decision.parameters or {} + if params.get("adjust_strategy"): + parts.append(f"具体修改指令:{params['adjust_strategy']}") + if params.get("focus_on"): + parts.append(f"重点关注:{params['focus_on']}") + return "\n".join(parts) + + class Coordinator: """Central coordinator for the EBM 5A workflow""" @@ -146,6 +157,28 @@ def handle_scheduling_decision( # Extract target stage target_stage = action.replace("backtrack_to_", "").capitalize() + # Guard: backtrack_to_acquire is only legitimate when Acquire returned 0 results + # OR evidence is completely unrelated to the clinical question. 
+ # If evidence exists (even low quality / indirect), proceed instead — GRADE handles + # indirectness via downgrade factors in Appraise, not by refusing to continue. + if target_stage == "Acquire": + evidence_list = state.get("evidence_list") or [] + acquire_backtracks = sum( + 1 for bt in state.get("backtrack_history", []) + if bt.get("to_stage") == "Acquire" + ) + if len(evidence_list) > 0 and acquire_backtracks >= 1: + # Already backtracked to Acquire once and still found evidence — proceed instead + print( + f"[GUARD] backtrack_to_acquire overridden → proceed " + f"(evidence_list has {len(evidence_list)} items, " + f"already backtracked {acquire_backtracks}x to Acquire)" + ) + decision.action = "proceed" + next_step = self.route_next(state) + state["current_step"] = next_step + return state + # Record backtrack state["backtrack_history"].append( { @@ -157,15 +190,15 @@ def handle_scheduling_decision( ) state["current_step"] = target_stage - state["backtrack_reason"] = decision.reasoning + state["backtrack_reason"] = _build_feedback(decision) elif action == "retry_current": # Stay on current stage, will be re-executed - state["backtrack_reason"] = decision.reasoning + state["backtrack_reason"] = _build_feedback(decision) elif action == "terminate": state["should_terminate"] = True - state["backtrack_reason"] = decision.reasoning + state["backtrack_reason"] = _build_feedback(decision) elif action == "request_human_review": # Record human intervention request @@ -250,13 +283,119 @@ def execute_workflow(self, question: str) -> WorkflowState: state["current_step"] = None break + # Presentation-only issue dimensions in Apply that are always retry_current + # (pure formatting defects — backtracking upstream cannot fix them). + _APPLY_PRESENTATION_DIMS = frozenset({ + "citation_traceable", + "recommendation_specific", + "patient_preference_considered", + }) + + # Fast-path Rule 3: fail_threshold but same stage already retried ≥3 times + # → stop cycling. 
Acquire with 0 results → backtrack to Ask once to broaden + # PICO; if Ask already received that hint and Acquire still empty, terminate. + current_stage_retries = state["agent_call_counts"].get(current_step, 0) - 1 + if not current_observe.evaluation.pass_threshold and current_stage_retries >= 3: + if current_step == "Acquire" and not state.get("evidence_list"): + ask_broaden_backtracks = sum( + 1 for bt in state.get("backtrack_history", []) + if bt.get("to_stage") == "Ask" + and "broaden_pico" in (bt.get("reason") or "") + ) + if ask_broaden_backtracks == 0: + decision = SchedulingDecision( + reasoning=( + "broaden_pico:Acquire 已 4 次空结果,疑似 PICO 过窄。" + "回退到 Ask 拓宽检索词(去除合并症细节、加同义词、放宽研究类型至 case report / guideline)。" + ), + action="backtrack_to_ask", + parameters={ + "adjust_strategy": ( + "Acquire 4 次返回 0 篇文献,请重写 PICO:" + "①保留核心病种与治疗意图,去除次要合并症;" + "②补充英文/MeSH 同义词与上位词;" + "③放宽 study_type 接受范围(允许 case report / guideline / observational)。" + ), + }, + ) + print( + f"[FAST-PATH-3] Acquire empty {current_stage_retries + 1}x — backtracking to Ask to broaden PICO." + ) + else: + decision = SchedulingDecision( + reasoning=( + f"自动终止(Acquire空结果上限):Acquire 已执行 " + f"{current_stage_retries + 1} 次仍未检索到任何证据,且 Ask " + "已收到 broaden_pico 提示,终止流程并由 Apply 报 Insufficient Evidence。" + ), + action="terminate", + parameters=None, + ) + print( + f"[FAST-PATH-3] Acquire empty after Ask already broadened — terminating." + ) + else: + decision = SchedulingDecision( + reasoning=( + f"自动前进(重试上限规则):{current_step} 阶段已执行 " + f"{current_stage_retries + 1} 次仍未通过,继续重试无收益,强制前进。" + ), + action="proceed", + parameters=None, + ) + print( + f"[FAST-PATH-3] Stage {current_step}: retried {current_stage_retries} times without passing — force proceed." + ) + + # Fast-path Rule 4: Apply stage, all non-passing issues are presentation-only + # → always retry_current, never backtrack upstream. 
+ elif current_step == "Apply" and not current_observe.evaluation.pass_threshold: + failing_dims = frozenset( + getattr(issue, "dimension", "") + for issue in current_observe.evaluation.issues + if getattr(issue, "severity", "") in ("critical", "major", "minor") + and "PARTIAL" not in getattr(issue, "description", "") + ) + if failing_dims and failing_dims.issubset(_APPLY_PRESENTATION_DIMS): + decision = SchedulingDecision( + reasoning=( + "自动重试(呈现类规则):Apply阶段失败维度全为呈现/格式问题 " + f"({', '.join(sorted(failing_dims))}),根因在Apply内部,重试可修复,禁止回退上游。" + ), + action="retry_current", + parameters={"adjust_strategy": "请使用PMID而非内部编号标注文献,确保推荐足够具体,并纳入患者偏好讨论。"}, + ) + print( + f"[FAST-PATH-4] Apply: all failures are presentation-only ({', '.join(sorted(failing_dims))}) — auto retry_current." + ) + else: + # Fall through to normal LLM scheduling below + decision = None + else: + decision = None + + # Fast-path Rule 5: soft_gate acquire_search_exhausted — proceed, let Apply report insufficient evidence + if decision is None and "acquire_search_exhausted" in soft_gate_signals: + if current_step == "Acquire": + decision = SchedulingDecision( + reasoning=( + "自动前进(检索穷尽规则):acquire_search_exhausted信号触发," + "该临床领域文献有限,继续检索无益,由Apply输出Insufficient Evidence。" + ), + action="proceed", + parameters=None, + ) + print("[FAST-PATH-5] acquire_search_exhausted — auto-proceed to Appraise.") + # Fast-path Rule 1: if score passes threshold AND all issues are Minor (no critical/major), # skip Scheduling LLM entirely and auto-proceed — matches the mandatory scheduling rule. 
has_critical_or_major = any( getattr(issue, "severity", "") in ("critical", "major") for issue in current_observe.evaluation.issues ) - if current_observe.evaluation.pass_threshold and not has_critical_or_major: + if decision is not None: + pass # already decided above + elif current_observe.evaluation.pass_threshold and not has_critical_or_major: decision = SchedulingDecision( reasoning="自动前进(快速规则):所有问题均为Minor级别且评分通过阈值,无需LLM决策。", action="proceed", @@ -266,58 +405,73 @@ def execute_workflow(self, question: str) -> WorkflowState: f"[FAST-PATH] Stage {current_step} passed with no critical/major issues — auto-proceeding." ) - elif current_observe.evaluation.pass_threshold and has_critical_or_major: - # Fast-path Rule 2: pass_threshold=True but the current major issue - # dimension set has appeared in ANY previous attempt of the same stage - # → the retry feedback loop is cycling and cannot improve → auto-proceed. - current_major_dims = frozenset( - getattr(issue, "dimension", "") + elif decision is None and current_observe.evaluation.pass_threshold and has_critical_or_major: + # Fast-path Rule 2a: all major/critical issues are PARTIAL (not NO/missing) + # → score passed threshold and no hard failures → auto-proceed. 
+ all_partial = all( + "PARTIAL" in getattr(issue, "description", "") for issue in current_observe.evaluation.issues if getattr(issue, "severity", "") in ("critical", "major") ) - # Collect all previous same-stage observations (excluding current) - prev_same_stage_obs = [ - obs - for obs in state["observe_history"][:-1] - if getattr(obs, "stage", None) == current_step - ] - - # Check whether current dims appeared in ANY prior same-stage attempt - dims_seen_before = any( - current_major_dims - == frozenset( - getattr(issue, "dimension", "") - for issue in obs.evaluation.issues - if getattr(issue, "severity", "") in ("critical", "major") - ) - for obs in prev_same_stage_obs - ) - - if dims_seen_before: + if all_partial: decision = SchedulingDecision( reasoning=( - "自动前进(循环Major规则):当前Major问题维度组合在本阶段历史尝试中已出现过," - "重试无法改善,且分数已通过阈值,直接前进。" + "自动前进(PARTIAL规则):所有Major/Critical问题均为PARTIAL(部分通过)," + "分数已通过阈值,重试无法显著改善,直接前进。" ), action="proceed", parameters=None, ) print( - f"[FAST-PATH-2] Stage {current_step}: cycling major issues " - f"({', '.join(sorted(current_major_dims))}) seen before — auto-proceeding." + f"[FAST-PATH-2a] Stage {current_step}: all major issues are PARTIAL — auto-proceeding." ) else: - t0 = time.time() - decision = self.scheduling_llm.make_decision( - observe=current_observe, - state=state, - soft_gate_signals=soft_gate_signals, + # Fast-path Rule 2b: same major dimension set seen before → cycling → auto-proceed. 
+ current_major_dims = frozenset( + getattr(issue, "dimension", "") + for issue in current_observe.evaluation.issues + if getattr(issue, "severity", "") in ("critical", "major") ) - print( - f"[TIMING] Scheduling ({current_step}): {time.time()-t0:.1f}s" + prev_same_stage_obs = [ + obs + for obs in state["observe_history"][:-1] + if getattr(obs, "stage", None) == current_step + ] + dims_seen_before = any( + current_major_dims + == frozenset( + getattr(issue, "dimension", "") + for issue in obs.evaluation.issues + if getattr(issue, "severity", "") in ("critical", "major") + ) + for obs in prev_same_stage_obs ) - else: + if dims_seen_before: + decision = SchedulingDecision( + reasoning=( + "自动前进(循环Major规则):当前Major问题维度组合在本阶段历史尝试中已出现过," + "重试无法改善,且分数已通过阈值,直接前进。" + ), + action="proceed", + parameters=None, + ) + print( + f"[FAST-PATH-2] Stage {current_step}: cycling major issues " + f"({', '.join(sorted(current_major_dims))}) seen before — auto-proceeding." + ) + else: + t0 = time.time() + decision = self.scheduling_llm.make_decision( + observe=current_observe, + state=state, + soft_gate_signals=soft_gate_signals, + ) + print( + f"[TIMING] Scheduling ({current_step}): {time.time()-t0:.1f}s" + ) + + elif decision is None: # fail_threshold → Make scheduling decision using Scheduling LLM t0 = time.time() decision = self.scheduling_llm.make_decision( diff --git a/src/coordinator/gate_engine.py b/src/coordinator/gate_engine.py index 8498ffd..487dbca 100644 --- a/src/coordinator/gate_engine.py +++ b/src/coordinator/gate_engine.py @@ -117,47 +117,6 @@ def check_evidence_insufficiency_gate(state: WorkflowState) -> Optional[GateTrig }, ) - # Scenario 2: Appraise stage found ALL evidence is Very Low quality AND no - # systematic reviews were found. This is a much stricter threshold than the - # old 80% rule: for rare diseases the evidence base is inherently low quality, - # so we should still proceed to Apply/Assess and produce a recommendation - # (with appropriate caveats). 
Only hard-fail when there is literally nothing - # of value AND we have tried to acquire evidence multiple times. - if state["current_step"] == "Appraise": - appraisal = state.get("appraisal_results") - acquire_attempts = state["agent_call_counts"].get("Acquire", 0) - if appraisal and appraisal.evidence and acquire_attempts >= 2: - grade_distribution: dict = {} - for e in appraisal.evidence: - if e.grade_level: - grade_distribution[e.grade_level] = ( - grade_distribution.get(e.grade_level, 0) + 1 - ) - - total_evidence = sum(grade_distribution.values()) - very_low_count = grade_distribution.get("Very Low", 0) - has_better_evidence = ( - total_evidence > very_low_count - ) # at least one non-VL article - - # Only terminate if 100% Very Low AND at least 5 articles appraised - # AND we have already tried Acquire twice — meaning re-searching won't help - if ( - total_evidence >= 5 - and not has_better_evidence - and very_low_count / total_evidence == 1.0 - ): - return GateTrigger( - gate_name="insufficient_evidence_quality", - reason="所有证据质量均为Very Low(经多次检索后确认)", - suggested_action="terminate", - output_message={ - "status": "evidence_quality_insufficient", - "message": "现有证据质量极低(Very Low),无法支持可靠的临床推荐。建议等待更高质量的研究发表。", - "grade_distribution": grade_distribution, - }, - ) - return None diff --git a/src/judge/judge_llm.py b/src/judge/judge_llm.py index 47108d2..d1df7ae 100644 --- a/src/judge/judge_llm.py +++ b/src/judge/judge_llm.py @@ -17,7 +17,8 @@ "Acquire": { "keywords_cover_pico_dimensions": (3, True), "primary_focus_match": (3, True), - "outcome_match": (3, True), + "p_match": (3, True), + "o_match": (3, True), "keywords_have_synonyms": (2, True), "keywords_count_sufficient": (2, True), "study_design_matches_route": (2, True), @@ -198,6 +199,36 @@ def _appraise_layer1_check(output: Dict) -> Dict: # --------------------------------------------------------------------------- +def _precheck_ask(pico_dict: dict) -> dict: + """Pre-check Ask output for basic keyword 
quality before LLM judge call.""" + import re + chinese = re.compile(r'[一-鿿]') + keywords = pico_dict.get("keywords", []) + keywords_english = not any(chinese.search(kw) for kw in keywords) + has_synonyms = len(set(keywords)) >= 2 + keyword_count_ok = len(keywords) > 1 + return { + "keywords_english_medical": "YES" if keywords_english else "NO", + "has_synonyms_or_mesh": "YES" if has_synonyms else "NO", + "keyword_count_ok": keyword_count_ok, + } + + +def _derive_routing_decision(audit: dict, pass_threshold: bool, + retry_count: int, max_retry: int = 2) -> str: + """Derive routing decision from Ask judge audit without LLM output.""" + gate_results = audit.get("gate_results", {}) + intent_ok = gate_results.get("intent_not_distorted") != "NO" + route_ok = gate_results.get("route_correct") != "NO" + if not intent_ok: + return "retry_structure" if retry_count < max_retry else "fallback" + if not route_ok: + return "retry_route" if retry_count < max_retry else "fallback" + if pass_threshold: + return "proceed" + return "retry_structure" if retry_count < max_retry else "fallback" + + def _score_ask(audit: Dict) -> Tuple[Dict[str, Any], List[Dict], bool, str]: """Gate + Rubric scoring for Ask stage.""" gate_failures = _check_gates("Ask", audit) @@ -349,504 +380,146 @@ def _score_ask_legacy(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, def _score_acquire(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str]: - """Convert Acquire audit classifications to dimension scores and issues.""" + """Convert Acquire audit classifications to dimension scores and issues. + + Reads the Gate+Rubrics format produced by acquire_judge.txt: + gate_results.search_terms_valid + rubric_results.{keywords_cover_pico_dimensions, primary_focus_match, ...} + overall_quality: pass | fail | gate_fail + failures: [...] 
+ """ issues: List[Dict] = [] - search_audit = audit.get("search_audit", {}) - evidence_audit = audit.get("evidence_audit", {}) + gate_results = audit.get("gate_results", {}) search_exhausted = bool(audit.get("search_exhausted", False)) - # --- Invalid search terms: circuit breaker --- - if search_audit.get("search_terms_valid") == "NO": - issues.append( - { - "severity": "critical", - "dimension": "evidence_potency", - "description": ( - "检索词构建有误,检索方向完全错误。" - "请根据PICO重新设计检索策略,使用正确的英文医学术语(MeSH词)。" - ), - } - ) - return ( - {"evidence_potency": 0.0, "evidence_hierarchy": 0.0, "pico_relevance": 0.0}, - issues, - False, - "检索词严重错误,无法获取有效证据。", - ) - - # --- Evidence hierarchy --- - type_scores = { - "SR_META": 1.0, - "RCT": 0.80, - "COHORT": 0.55, - "CASE_CONTROL": 0.35, - "CASE_REPORT": 0.15, - "NONE": 0.0, - } - best_type = evidence_audit.get("best_study_type", "NONE") - evidence_hierarchy = type_scores.get(best_type, 0.0) - - # --- Evidence potency = hierarchy × ability to answer PICO --- - answers_map = {"YES": 1.0, "PARTIAL": 0.6, "NO": 0.2} - answers_val = evidence_audit.get("best_evidence_answers_pico", "NO") - evidence_potency = evidence_hierarchy * answers_map.get(answers_val, 0.2) - - # --- PICO relevance = average of P / I / O match --- - pico_match_map = {"YES": 1.0, "PARTIAL": 0.5, "NO": 0.0} - p_score = pico_match_map.get(evidence_audit.get("pico_p_match", "NO"), 0.0) - i_score = pico_match_map.get(evidence_audit.get("pico_i_match", "NO"), 0.0) - o_score = pico_match_map.get(evidence_audit.get("pico_o_match", "NO"), 0.0) - pico_relevance = (p_score + i_score + o_score) / 3 - - # --- Issue generation --- - if best_type == "NONE" and not search_exhausted: - # Reaching here means search_terms_valid=YES (circuit breaker already returned for NO). - # NONE results with valid terms = API/network error or genuinely empty literature. - # Use "major" (not "critical") and advise retrying with same terms. 
- issues.append( - { - "severity": "major", - "dimension": "evidence_potency", - "description": ( - "检索返回零结果,但检索词已确认有效(可能是API网络错误或临时故障)。" - "请保持原检索词直接重试,不要改变搜索策略。" - ), - } - ) - elif best_type == "CASE_REPORT": - issues.append( - { - "severity": "major", - "dimension": "evidence_hierarchy", - "description": ( - f"找到的最高质量证据仅为病例报告({best_type})。" - "建议尝试更宽泛或不同的检索词以寻找更高层级的证据(RCT或SR)。" - ), - } - ) - - if answers_val == "NO" and not search_exhausted: - issues.append( - { - "severity": "major", - "dimension": "evidence_potency", - "description": ( - "现有最佳证据无法直接回答PICO临床问题," - "请调整检索策略以找到更直接相关的证据。" - ), - } - ) - - pico_match_labels = { - "pico_p_match": "Patient人群", - "pico_i_match": "Intervention干预", - "pico_o_match": "Outcome结局", - } - for key, label in pico_match_labels.items(): - val = evidence_audit.get(key, "YES") - if val == "NO": - issues.append( - { - "severity": "major", - "dimension": "pico_relevance", - "description": ( - f"证据与PICO的 {label} 严重不匹配," - "请调整检索词以找到更匹配的证据。" - ), - } - ) - elif val == "PARTIAL": - issues.append( - { - "severity": "minor", - "dimension": "pico_relevance", - "description": ( - f"证据与PICO的 {label} 存在间接性," - "请在后续评价阶段注意外推限制。" - ), - } - ) - - # --- Listwise selection quality --- - # Mapped onto existing dimensions: selection order → evidence_potency adjustment; - # selection count → evidence_hierarchy adjustment. 
- listwise_audit = audit.get("listwise_audit", {}) - top_sel_val = listwise_audit.get("top_selection_appropriate", "YES") - count_val = listwise_audit.get("selection_count_appropriate", "YES") - - listwise_map = {"YES": 0.0, "PARTIAL": -0.05, "NO": -0.15} - evidence_potency = max(0.0, evidence_potency + listwise_map.get(top_sel_val, 0.0)) - evidence_hierarchy = max(0.0, evidence_hierarchy + listwise_map.get(count_val, 0.0)) - - if top_sel_val == "NO": - issues.append( - { - "severity": "major", - "dimension": "evidence_potency", - "description": ( - "Listwise排序结果不合理:排名靠前的文献并非最优证据," - "或纳入了明显不相关的文献。请重新审视选择策略。" - ), - } - ) - elif top_sel_val == "PARTIAL": - issues.append( - { + # Gate: invalid search terms + if gate_results.get("search_terms_valid") == "NO": + issues.append({ + "severity": "critical", + "dimension": "search_terms_valid", + "description": "检索词构建有误,检索方向完全错误。请根据PICO重新设计检索策略。", + }) + return {"overall": 0.0}, issues, False, "检索词严重错误,无法获取有效证据。" + + # Rubric scoring via shared helper + dim_scores, rubric_issues, overall_score = _score_rubrics("Acquire", audit) + issues.extend(rubric_issues) + + # Translate failures[] from LLM into additional context + for failure_msg in audit.get("failures", []): + if failure_msg and not any(failure_msg in i.get("description", "") for i in issues): + issues.append({ "severity": "minor", - "dimension": "evidence_potency", - "description": "Listwise排名顺序有轻微偏差,个别文献的排名位置有待优化。", - } - ) - - if count_val == "NO": - issues.append( - { - "severity": "major", - "dimension": "evidence_hierarchy", - "description": ( - "选择数量明显不合理(有效候选很多却仅选极少篇,或质量差仍凑满10篇)," - "请重新调整Listwise筛选标准。" - ), - } - ) + "dimension": "acquire_detail", + "description": failure_msg, + }) - dimension_scores = { - "evidence_potency": evidence_potency, - "evidence_hierarchy": evidence_hierarchy, - "pico_relevance": pico_relevance, - } - return dimension_scores, issues, search_exhausted, audit.get("reasoning", "") + return dim_scores, issues, search_exhausted, 
audit.get("reasoning", "") def _score_appraise(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str]: - """ - Convert Appraise audit classifications to dimension scores and issues. - - The judge now evaluates the Appraise Agent's structured output which - contains explicit GRADE factor classifications (study_type, risk_of_bias, - etc.) and Python-computed grades. The audit fields reflect this: - - grade_audit.study_type_correct: was study type identified correctly? - - grade_audit.downgrade_factors_appropriate: were factor labels reasonable? - - grade_audit.computed_grade_reasonable: does the final grade make sense? + """Convert Appraise audit classifications to dimension scores and issues. + + Reads gate_results/rubric_results from appraise_judge.txt output. + Gates: study_type_correct, computed_grade_reasonable + Rubrics: downgrade_factors_appropriate, included_study_type_correct, + upgrade_factors_appropriate, upgrade_blocked_appropriate, + conflicts_identified, numerical_data_extracted """ issues: List[Dict] = [] - grade_audit = audit.get("grade_audit", {}) - conflict_audit = audit.get("conflict_audit", {}) - data_audit = audit.get("data_audit", {}) - - # --- grade_reasonableness --- - # 30% study_type correctness + 30% downgrade factor quality + 40% computed grade - type_map = {"YES": 1.0, "PARTIAL": 0.70, "NO": 0.0} - factor_map = {"YES": 1.0, "PARTIAL": 0.60, "NO": 0.15} - grade_map = {"YES": 1.0, "PARTIAL": 0.65, "NO": 0.10} - - type_val = grade_audit.get("study_type_correct", "YES") - factor_val = grade_audit.get("downgrade_factors_appropriate", "YES") - grade_val = grade_audit.get("computed_grade_reasonable", "YES") - - grade_reasonableness = ( - 0.30 * type_map.get(type_val, 1.0) - + 0.30 * factor_map.get(factor_val, 1.0) - + 0.40 * grade_map.get(grade_val, 1.0) - ) - - if type_val == "NO": - issues.append( - { - "severity": "critical", - "dimension": "grade_reasonableness", - "description": ( - "研究类型(study_type)分类存在明显错误,导致GRADE初始等级错误。" - 
"请重新识别每篇研究的设计类型:RCT / COHORT / CASE_CONTROL / CASE_REPORT。" - ), - } - ) - elif type_val == "PARTIAL": - issues.append( - { - "severity": "major", - "dimension": "grade_reasonableness", - "description": "部分研究的类型识别有误,请复查并修正错误的study_type标签。", - } - ) - - if factor_val == "NO": - issues.append( - { - "severity": "major", - "dimension": "grade_reasonableness", - "description": ( - "GRADE降级因素评估存在明显错误(如将未盲法RCT标记为NOT_SERIOUS偏倚风险)。" - "请重新评估各降级因素(risk_of_bias、inconsistency、indirectness、imprecision)。" - ), - } - ) - elif factor_val == "PARTIAL": - issues.append( - { - "severity": "minor", - "dimension": "grade_reasonableness", - "description": "个别降级因素的严重程度评估过于宽松或严苛,请复查相关分类依据。", - } - ) + gate_results = audit.get("gate_results", {}) + search_exhausted = bool(audit.get("search_exhausted", False)) - if grade_val == "NO": - issues.append( - { - "severity": "critical", - "dimension": "grade_reasonableness", - "description": ( - "系统计算出的最终GRADE等级(computed_grade)明显不合理," - "根本原因通常是study_type或降级因素分类错误。请修正上游分类标签。" - ), - } - ) - elif grade_val == "PARTIAL": - issues.append( - { + # G1 → G2 dependency: if study_type is wrong, G2 is automatically UNCERTAIN + # (wrong study_type already captured by G1; don't double-penalise with G2) + g1 = gate_results.get("study_type_correct", "YES") + g2 = gate_results.get("computed_grade_reasonable", "YES") + if g1 == "NO": + g2 = "UNCERTAIN" + + if g2 == "NO": + # Hard gate: calculation is clearly wrong + issues.append({ + "severity": "critical", + "dimension": "computed_grade_reasonable", + "description": "Gate 失败: computed_grade_reasonable — GRADE等级计算明显不合理", + }) + return {"overall": 0.0}, issues, False, "Gate失败: computed_grade_reasonable" + + if g2 == "UNCERTAIN": + # Soft warning: study design ambiguous, can't verify — demote to MAJOR + issues.append({ + "severity": "major", + "dimension": "computed_grade_reasonable", + "description": "computed_grade_reasonable 无法确认(研究设计有歧义或摘要信息不足)", + }) + + # Rubric scoring + dim_scores, rubric_issues, 
overall_score = _score_rubrics("Appraise", audit) + issues.extend(rubric_issues) + + # study_type_correct as a major issue (not gate-level) + if gate_results.get("study_type_correct") == "NO": + issues.append({ + "severity": "major", + "dimension": "study_type_correct", + "description": "存在研究类型识别错误,请检查 study_type 标注", + }) + + for failure_msg in audit.get("failures", []): + if failure_msg and not any(failure_msg in i.get("description", "") for i in issues): + issues.append({ "severity": "minor", - "dimension": "grade_reasonableness", - "description": "个别研究的计算等级与预期有轻微偏差,可接受但建议核查分类标签。", - } - ) - - # --- conflict_identification --- - conflicts_exist = conflict_audit.get("conflicts_exist", "NO") - conflicts_id_val = conflict_audit.get("conflicts_identified", "NA") - - if conflicts_exist == "NO": - conflict_identification = 1.0 # Correctly assessed no conflict - else: - conflict_id_map = {"YES": 1.0, "PARTIAL": 0.5, "NO": 0.0, "NA": 1.0} - conflict_identification = conflict_id_map.get(conflicts_id_val, 0.0) - if conflicts_id_val == "NO": - issues.append( - { - "severity": "major", - "dimension": "conflict_identification", - "description": ( - "证据间存在实质性冲突但未被识别," - "请重新比较各研究的结论方向并分析冲突原因。" - ), - } - ) - elif conflicts_id_val == "PARTIAL": - issues.append( - { - "severity": "minor", - "dimension": "conflict_identification", - "description": "证据冲突识别不完整,conflict_description需补充遗漏的冲突说明。", - } - ) - - # --- numerical_confidence --- - # data_score: did the agent correctly assess what data is available? - data_map = {"YES": 1.0, "PARTIAL": 0.65, "NO": 0.1, "NA": 0.85} - data_val = data_audit.get("numerical_data_extracted", "PARTIAL") - data_score = data_map.get(data_val, 0.65) - - # confidence_accuracy: was the confidence_level label itself appropriate? 
- # HIGH = accurate label (no over/under-confidence), VERY_LOW = serious mismatch - confidence_accuracy_map = { - "HIGH": 1.0, - "MODERATE": 0.75, - "LOW": 0.35, - "VERY_LOW": 0.10, - } - conf_acc_val = data_audit.get("confidence_level_appropriate", "MODERATE") - confidence_accuracy = confidence_accuracy_map.get(conf_acc_val, 0.75) - - numerical_confidence = 0.5 * data_score + 0.5 * confidence_accuracy - - if data_val == "NO": - issues.append( - { - "severity": "major", - "dimension": "numerical_confidence", - "description": ( - "摘要中存在数值数据但未被提取或评估," - "请补充提取关键数值指标(效应量、置信区间、P值等)。" - ), - } - ) - if conf_acc_val == "LOW": - issues.append( - { - "severity": "major", - "dimension": "numerical_confidence", - "description": ( - "置信度标签评估过高(实际数值不可靠)," - "请下调confidence_level并在推荐中标注数值的不确定性。" - ), - } - ) - elif conf_acc_val == "VERY_LOW": - issues.append( - { - "severity": "major", - "dimension": "numerical_confidence", - "description": ( - "置信度标签与实际数据质量严重不符," - "请重新评估数值可靠性,必要时请求人工审核。" - ), - } - ) + "dimension": "appraise_detail", + "description": failure_msg, + }) - dimension_scores = { - "grade_reasonableness": grade_reasonableness, - "conflict_identification": conflict_identification, - "numerical_confidence": numerical_confidence, - } - return dimension_scores, issues, False, audit.get("reasoning", "") + return dim_scores, issues, search_exhausted, audit.get("reasoning", "") def _score_apply(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str]: - """Convert Apply audit classifications to dimension scores and issues.""" + """Convert Apply audit classifications to dimension scores and issues. + + Reads gate_results/rubric_results from apply_judge.txt output. 
+ Gates: recommendation_grounded_in_evidence, route_dimension_consistent, + strength_not_grossly_inflated + Rubrics: effect_size_correctly_reported, strength_matches_evidence, + population_applicability_addressed, uncertainty_source_explained, + citation_traceable, recommendation_specific, patient_preference_considered + """ issues: List[Dict] = [] - grounding_audit = audit.get("grounding_audit", {}) - strength_audit = audit.get("strength_audit", {}) - actionability_audit = audit.get("actionability_audit", {}) - - # --- evidence_alignment --- - grounding_map = {"YES": 1.0, "PARTIAL": 0.55, "NO": 0.1} - rec_based_val = grounding_audit.get("recommendation_based_on_evidence", "YES") - uses_external_val = grounding_audit.get("uses_external_knowledge", "NO") - - base_alignment = grounding_map.get(rec_based_val, 1.0) - external_penalty = 0.25 if uses_external_val == "YES" else 0.0 - evidence_alignment = max(0.0, base_alignment - external_penalty) + gate_results = audit.get("gate_results", {}) + search_exhausted = bool(audit.get("search_exhausted", False)) - if rec_based_val == "NO": - issues.append( - { - "severity": "critical", - "dimension": "evidence_alignment", - "description": ( - "推荐与提供的证据无关或方向相反," - "请严格基于本次检索的证据重新生成推荐,不得引入外部知识。" - ), - } - ) - elif rec_based_val == "PARTIAL": - issues.append( - { - "severity": "major", - "dimension": "evidence_alignment", - "description": ( - "推荐部分超出证据范围,请移除未有本次检索证据支持的推断内容。" - ), - } - ) - if uses_external_val == "YES": - issues.append( - { - "severity": "major", - "dimension": "evidence_alignment", - "description": ( - "检测到使用了外部知识(如'通常认为'、'临床经验')替代证据," - "请仅基于本次提供的证据列表生成推荐。" - ), - } - ) + # Gates + gate_failures = [] + for gate_key in ("recommendation_grounded_in_evidence", "route_dimension_consistent", + "strength_not_grossly_inflated"): + if gate_results.get(gate_key) == "NO": + gate_failures.append(gate_key) - # --- strength_appropriateness --- - insuf_val = strength_audit.get("insufficient_evidence_appropriate", "NA") - if 
insuf_val == "YES": - strength_appropriateness = 1.0 - elif insuf_val == "NO": - strength_appropriateness = 0.15 - issues.append( - { + if gate_failures: + for g in gate_failures: + issues.append({ "severity": "critical", - "dimension": "strength_appropriateness", - "description": ( - "证据充足但错误输出了'证据不足'声明," - "请根据现有证据质量给出对应强度的推荐。" - ), - } - ) - else: # NA — a normal recommendation was produced - strength_map = {"YES": 1.0, "MINOR_MISMATCH": 0.65, "MAJOR_MISMATCH": 0.15} - strength_val = strength_audit.get("strength_matches_evidence_quality", "YES") - strength_appropriateness = strength_map.get(strength_val, 1.0) - if strength_val == "MAJOR_MISMATCH": - issues.append( - { - "severity": "critical", - "dimension": "strength_appropriateness", - "description": ( - "推荐强度与证据质量严重不符(如Very Low证据给出Strong推荐)," - "请依据GRADE原则重新确定推荐强度:Very Low→Weak或证据不足;" - "Low→Weak;Moderate→Conditional/Moderate;High→Strong。" - ), - } - ) - elif strength_val == "MINOR_MISMATCH": - issues.append( - { - "severity": "major", - "dimension": "strength_appropriateness", - "description": ( - "推荐强度与证据质量存在轻微不符," - "请检查证据质量等级与推荐强度是否严格匹配。" - ), - } - ) + "dimension": g, + "description": f"Gate 失败: {g}", + }) + return {"overall": 0.0}, issues, False, f"Gate失败: {', '.join(gate_failures)}" - # --- actionability --- - specific_map = {"YES": 1.0, "PARTIAL": 0.6, "NO": 0.1} - caveats_map = {"YES": 1.0, "PARTIAL": 0.6, "NO": 0.2, "NA": 1.0} - specific_val = actionability_audit.get("recommendation_specific", "YES") - caveats_val = actionability_audit.get("caveats_documented", "NA") - actionability = 0.6 * specific_map.get(specific_val, 1.0) + 0.4 * caveats_map.get( - caveats_val, 1.0 - ) + # Rubric scoring + dim_scores, rubric_issues, overall_score = _score_rubrics("Apply", audit) + issues.extend(rubric_issues) - if specific_val == "NO": - issues.append( - { - "severity": "major", - "dimension": "actionability", - "description": ( - "推荐过于模糊,临床医生无法据此执行," - "请提供更具体的推荐内容(如适应症、用药剂量、疗程等关键参数)。" - ), - } - ) - elif 
specific_val == "PARTIAL": - issues.append( - { - "severity": "minor", - "dimension": "actionability", - "description": "推荐可以更加具体,请补充关键临床细节以提高可操作性。", - } - ) - if caveats_val == "NO": - issues.append( - { - "severity": "minor", - "dimension": "actionability", - "description": ( - "存在重要的适用性限制或PICO不匹配未在caveats中说明," - "请补充相关注意事项。" - ), - } - ) - elif caveats_val == "PARTIAL": - issues.append( - { + for failure_msg in audit.get("failures", []): + if failure_msg and not any(failure_msg in i.get("description", "") for i in issues): + issues.append({ "severity": "minor", - "dimension": "actionability", - "description": "caveats中部分重要限制未说明,请补充完整。", - } - ) + "dimension": "apply_detail", + "description": failure_msg, + }) - dimension_scores = { - "evidence_alignment": evidence_alignment, - "strength_appropriateness": strength_appropriateness, - "actionability": actionability, - } - return dimension_scores, issues, False, audit.get("reasoning", "") + return dim_scores, issues, search_exhausted, audit.get("reasoning", "") def _score_assess(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str]: @@ -994,7 +667,9 @@ def _score_assess(audit: Dict) -> Tuple[Dict[str, float], List[Dict], bool, str] "reasoning_chain": reasoning_chain, "logical_consistency": logical_consistency, } - return dimension_scores, issues, False, audit.get("reasoning", "") + failures = audit.get("failures", []) + hint = "; ".join(failures) if failures else f"综合评分: {(answer_completeness + reasoning_chain + logical_consistency) / 3:.2f}" + return dimension_scores, issues, False, hint # Dispatch table: stage name -> scorer function @@ -1057,17 +732,22 @@ def _calculate_overall_score( self, stage: str, dimension_scores: Dict[str, float] ) -> float: """Calculate weighted overall score from dimension scores.""" - weights = STAGE_WEIGHTS.get(stage, {}) + # direct_answer Ask path uses a single "nonresearch" dimension that isn't + # in RUBRIC_WEIGHTS — return it directly so it isn't dropped to 0. 
+ if stage == "Ask" and "nonresearch" in dimension_scores: + return dimension_scores["nonresearch"] + + weights = {dim: w for dim, (w, _) in RUBRIC_WEIGHTS.get(stage, {}).items()} if not weights: - return ( - sum(dimension_scores.values()) / len(dimension_scores) - if dimension_scores - else 0.0 - ) + valid = [v for v in dimension_scores.values() if v is not None] + return sum(valid) / len(valid) if valid else 0.0 total = 0.0 weight_sum = 0.0 for dim, weight in weights.items(): - total += dimension_scores.get(dim, 0.0) * weight + val = dimension_scores.get(dim) + if val is None: + continue # NA — excluded from denominator + total += val * weight weight_sum += weight return total / weight_sum if weight_sum > 0 else 0.0 @@ -1130,6 +810,14 @@ def evaluate_stage( summary = reasoning_hint if reasoning_hint else f"综合评分: {overall_score:.2f}" + # For Ask stage: derive routing decision and write to state as side effect + if stage == "Ask": + retry_count = state.get("agent_call_counts", {}).get("Ask", 1) - 1 + routing_decision = _derive_routing_decision(audit, pass_threshold, retry_count) + state["_ask_routing_decision"] = routing_decision # type: ignore[typeddict-unknown-key] + if routing_decision == "fallback": + state["route_confidence"] = 0.0 # type: ignore[typeddict-item] + evaluation = Evaluation( overall_score=overall_score, dimension_scores=dimension_scores, @@ -1149,27 +837,46 @@ def _prepare_context( self, stage: str, output: Dict[str, Any], state: WorkflowState ) -> Dict[str, Any]: """Prepare context variables for judge prompt based on stage.""" - context = {"stage_output": self._format_stage_output(output)} + context = { + "stage_output": self._format_stage_output(output), + "route_type": state.get("route_type") or "full_pipeline", + } if stage == "Ask": context["original_question"] = state["original_question"] + context["route_type"] = state.get("route_type") or "full_pipeline" elif stage == "Acquire": + ebm_q = state.get("ebm_query") pico = 
state.get("pico_query") - if pico: - context["pico_query"] = json.dumps( + if ebm_q: + context["ebm_query"] = json.dumps( + { + "query_type": ebm_q.query_type, + "patient": ebm_q.patient, + "primary_focus": ebm_q.primary_focus, + "outcome": ebm_q.outcome, + "keywords": ebm_q.keywords, + "comparator": ebm_q.comparator, + }, + ensure_ascii=False, + indent=2, + ) + elif pico: + context["ebm_query"] = json.dumps( { + "query_type": "pico", "patient": pico.patient, - "intervention": pico.intervention, - "comparison": pico.comparison, + "primary_focus": pico.intervention, "outcome": pico.outcome, "keywords": pico.keywords, + "comparator": pico.comparison, }, ensure_ascii=False, indent=2, ) else: - context["pico_query"] = "N/A" + context["ebm_query"] = "N/A" # Condense evidence list to avoid context overflow raw_output = output @@ -1184,6 +891,10 @@ def _prepare_context( "pmid": getattr(e, "pmid", ""), "study_type": getattr(e, "study_type", "Unknown"), "relevance_score": getattr(e, "relevance_score", 0.0), + # has_full_text and key_sentences let the Judge verify + # key_sentences_present without guessing from other fields. + "has_full_text": getattr(e, "has_full_text", False), + "has_key_sentences": bool(getattr(e, "key_sentences", None)), "abstract_preview": (getattr(e, "abstract", "") or "")[:200], } ) @@ -1216,6 +927,11 @@ def _prepare_context( "source": e.source, "pmid": e.pmid, "relevance_score": e.relevance_score, + # pub_types is the authoritative study design field used by AppraiseAgent. + # Including it here lets the Judge verify study_type using the same + # source of truth, eliminating abstract-text vs metadata divergence. + "pub_types": getattr(e, "pub_types", None) or [], + "abstract": (getattr(e, "abstract", "") or "")[:300], } for e in evidence_list ], @@ -1225,8 +941,8 @@ def _prepare_context( # Override stage_output: strip full abstracts from Evidence objects. 
# The Judge audits GRADE classification labels (study_type, risk_of_bias, etc.), - # not the raw text — abstracts are already shown via evidence_list above. - # Removing them cuts ~4000 redundant chars (~1000 tokens) from the Judge prompt. + # not the raw text — abstracts are shown via evidence_list above (truncated to 300 chars). + # Removing full abstracts here cuts ~4000 redundant chars (~1000 tokens) from the Judge prompt. appraisal = output.get("appraisal_results") if appraisal and is_dataclass(appraisal): appraisal_d = asdict(appraisal) @@ -1250,28 +966,38 @@ def _prepare_context( ) elif stage == "Apply": + ebm_q = state.get("ebm_query") pico = state.get("pico_query") - if pico: - context["pico_query"] = json.dumps( - { - "patient": pico.patient, - "intervention": pico.intervention, - "comparison": pico.comparison, - "outcome": pico.outcome, - }, - ensure_ascii=False, - indent=2, + if ebm_q: + context["query_description"] = ( + f"{ebm_q.query_type}: {ebm_q.patient} / {ebm_q.primary_focus} / " + f"{ebm_q.comparator or 'N/A'} / {ebm_q.outcome}" + ) + elif pico: + context["query_description"] = ( + f"pico: {pico.patient} / {pico.intervention} / " + f"{pico.comparison} / {pico.outcome}" ) else: - context["pico_query"] = "N/A" + context["query_description"] = "N/A" appraisal = state.get("appraisal_results") if appraisal: + # Include per-evidence grade breakdown so the Judge can verify + # strength_matches_evidence using the same adopted-evidence view + # that Apply Agent used (evidence_quality field), not just the + # aggregate summary which hides which studies were excluded. 
+ grade_rationales = state.get("grade_rationales") or [] + grade_breakdown = [ + {"title": r.get("title", ""), "computed_grade": r.get("computed_grade", "")} + for r in grade_rationales + ] context["appraisal_results"] = json.dumps( { "evidence_count": len(appraisal.evidence), "has_conflict": appraisal.has_conflict, "summary": appraisal.summary, + "grade_breakdown": grade_breakdown, }, ensure_ascii=False, indent=2, @@ -1281,14 +1007,24 @@ def _prepare_context( elif stage == "Assess": context["original_question"] = state["original_question"] + context["route_confidence"] = state.get("route_confidence") or "N/A" + ebm_q = state.get("ebm_query") pico = state.get("pico_query") - if pico: + if ebm_q: + context["ebm_query_description"] = ( + f"{ebm_q.query_type}: {ebm_q.patient} / {ebm_q.primary_focus} / " + f"{ebm_q.comparator or 'N/A'} / {ebm_q.outcome}" + ) + context["pico_query"] = context["ebm_query_description"] + elif pico: context["pico_query"] = ( f"{pico.patient} / {pico.intervention} / {pico.comparison} / {pico.outcome}" ) + context["ebm_query_description"] = context["pico_query"] else: context["pico_query"] = "N/A" + context["ebm_query_description"] = "N/A" evidence_list = state.get("evidence_list", []) context["evidence_count"] = len(evidence_list) diff --git a/src/main.py b/src/main.py index 991385e..a98f9f9 100644 --- a/src/main.py +++ b/src/main.py @@ -4,8 +4,25 @@ """ import sys +import io +import time +from concurrent.futures import ThreadPoolExecutor from typing import Dict, Any -from src.config.llm_config import get_llm, get_fast_llm + +# Force UTF-8 on Windows to avoid GBK encoding errors with Unicode characters. +# line_buffering=True so [TIMING] / stage markers flush as they happen — without +# this, when stdout is redirected to a file the buffer is block-sized and a +# subprocess killed mid-run produces an empty log, making timeouts invisible. 
+if sys.stdout.encoding and sys.stdout.encoding.lower() != "utf-8": + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace", line_buffering=True) + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace", line_buffering=True) +else: + try: + sys.stdout.reconfigure(line_buffering=True) + sys.stderr.reconfigure(line_buffering=True) + except (AttributeError, ValueError): + pass +from src.config.llm_config import get_llm, get_fast_llm, get_cache_stats, get_ttft_samples from src.agents.ask_agent import AskAgent from src.agents.acquire_agent import AcquireAgent from src.agents.appraise_agent import AppraiseAgent @@ -16,6 +33,29 @@ from src.coordinator.coordinator import Coordinator +def _warmup_llms() -> None: + """Fire one minimal request to each LLM purpose in parallel so HTTP + connections and the upstream model are pre-warmed before the pipeline's + first real call. Failures are swallowed — warmup must never block the run.""" + t0 = time.time() + clients = [ + ("agent", get_llm(temperature=0.0, purpose="agent")), + ("judge", get_fast_llm(temperature=0.0, purpose="judge")), + ("scheduling", get_fast_llm(temperature=0.0, purpose="scheduling")), + ] + + def _ping(name_client): + name, client = name_client + try: + client.invoke("ok") + except Exception as exc: + print(f"[WARMUP] {name} failed (non-fatal): {exc}") + + with ThreadPoolExecutor(max_workers=len(clients)) as pool: + list(pool.map(_ping, clients)) + print(f"[TIMING] warmup: {time.time() - t0:.2f}s") + + def create_workflow() -> Coordinator: """ Create and configure the workflow coordinator with all agents @@ -23,9 +63,10 @@ def create_workflow() -> Coordinator: Returns: Configured Coordinator instance """ + _warmup_llms() + # Initialize LLM - llm = get_llm(temperature=0.0) - fast_llm = get_fast_llm(temperature=0.0) + llm = get_llm(temperature=0.0, purpose="agent") # Initialize agents agents = { @@ -37,10 +78,10 @@ def create_workflow() -> Coordinator: } 
# Initialize Judge LLM (use fast model for classification tasks) - judge_llm = JudgeLLM(llm=fast_llm) + judge_llm = JudgeLLM(llm=get_fast_llm(temperature=0.0, purpose="judge")) # Initialize Scheduling LLM (use fast model for classification tasks) - scheduling_llm = SchedulingLLM(llm=fast_llm) + scheduling_llm = SchedulingLLM(llm=get_fast_llm(temperature=0.0, purpose="scheduling")) # Create coordinator coordinator = Coordinator( @@ -198,8 +239,23 @@ def format_output(state: Dict[str, Any]) -> str: rec = state.get("recommendation") assess = state.get("assessment") + direct = state.get("direct_answer_output") - if rec: + if state.get("route_type") == "direct_answer" and direct: + output.append(f"A: {direct.get('answer', '[empty]')}") + output.append("") + basis = direct.get("answer_basis") + guideline = direct.get("guideline_source") + if basis: + output.append(f" Answer Basis : {basis}") + if guideline: + output.append(f" Guideline Source : {guideline}") + caveats = direct.get("caveats") or [] + if caveats: + output.append(" Caveats :") + for c in caveats: + output.append(f" • {c}") + elif rec: output.append(f"A: {rec.text}") output.append("") output.append(f" Recommendation Strength : {rec.strength}") @@ -247,6 +303,33 @@ def main(): result = run_clinical_question(question) output = format_output(result) print(output) + stats = get_cache_stats() + # huatuogpt.cn gateway reports prefix caching by reducing prompt_tokens + # rather than via cached_tokens — so we surface raw totals; comparing + # prompt_tokens across runs of the same workflow shows the cache effect. + print( + f"[CACHE] calls={stats['calls']} " + f"total_prompt_tokens={stats['prompt_tokens']} " + f"cached_tokens(openai-style)={stats['cached_tokens']}" + ) + + # Per-purpose TTFT (time-to-first-token) summary from streamed calls. 
+ ttft_data = get_ttft_samples() + if ttft_data: + print("[TTFT] per-purpose summary (ttft / total elapsed in seconds):") + for purpose, samples in sorted(ttft_data.items()): + valid_ttft = [s["ttft"] for s in samples if s["ttft"] is not None] + elapsed = [s["elapsed"] for s in samples] + if not valid_ttft: + continue + avg_ttft = sum(valid_ttft) / len(valid_ttft) + med_ttft = sorted(valid_ttft)[len(valid_ttft) // 2] + avg_elapsed = sum(elapsed) / len(elapsed) + print( + f" {purpose:14s} n={len(samples):2d} " + f"ttft avg={avg_ttft:5.2f}s med={med_ttft:5.2f}s " + f"elapsed avg={avg_elapsed:5.2f}s" + ) except Exception as e: print(f"Error: {e}") import traceback diff --git a/src/scheduling/scheduling_llm.py b/src/scheduling/scheduling_llm.py index 10138a4..a6f3100 100644 --- a/src/scheduling/scheduling_llm.py +++ b/src/scheduling/scheduling_llm.py @@ -75,7 +75,7 @@ def _prepare_context( # Format dimension scores dimension_scores_str = "\n".join( [ - f" - {dim}: {score:.2f}" + f" - {dim}: {score:.2f}" if score is not None else f" - {dim}: N/A" for dim, score in observe.evaluation.dimension_scores.items() ] ) diff --git a/src/state/schema.py b/src/state/schema.py index 84a62e4..1f55e36 100644 --- a/src/state/schema.py +++ b/src/state/schema.py @@ -44,6 +44,7 @@ class Evidence: full_text: Optional[str] = None # Full text (local DB only, not passed to prompts) key_sentences: Optional[str] = None # Extracted span(s) relevant to query keywords has_full_text: bool = False # True when full_text field is populated + pub_types: Optional[List[str]] = None # PubMed publication types (e.g. ["Randomized Controlled Trial"]) @dataclass diff --git a/src/tools/pubmed_api.py b/src/tools/pubmed_api.py index 99a47c9..7bd24f8 100644 --- a/src/tools/pubmed_api.py +++ b/src/tools/pubmed_api.py @@ -252,6 +252,19 @@ def search_pubmed( abstract = abstracts.get(pmid, "") pmcid = pmc_ids.get(pmid) # None if not in PMC open-access + # Extract publication types from esummary pubtype list. 
+ # Each entry is either a plain string or a dict with a "value" key + # depending on the API version — handle both forms. + raw_pubtypes = article.get("pubtype", []) + pub_types = [] + for pt in raw_pubtypes: + if isinstance(pt, str): + pub_types.append(pt) + elif isinstance(pt, dict): + v = pt.get("value") or pt.get("name") or "" + if v: + pub_types.append(v) + evidence = Evidence( title=article.get("title", "No title"), source=article.get("source", "PubMed"), @@ -263,6 +276,7 @@ def search_pubmed( grade_level=None, pmcid=pmcid, has_full_text=pmcid is not None, + pub_types=pub_types or None, ) evidence_list.append(evidence) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/agents/__init__.py b/tests/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/agents/test_acquire_agent.py b/tests/agents/test_acquire_agent.py new file mode 100644 index 0000000..47ad138 --- /dev/null +++ b/tests/agents/test_acquire_agent.py @@ -0,0 +1,48 @@ +import pytest +from unittest.mock import Mock, patch +from src.agents.acquire_agent import AcquireAgent +from src.state.schema import WorkflowState, PICOQuery, Evidence + +@pytest.fixture +def mock_llm(): + llm = Mock() + llm.invoke = Mock(return_value=Mock( + content="aspirin AND primary prevention" + )) + return llm + +@pytest.fixture +def sample_state(): + return WorkflowState( + original_question="Should I prescribe aspirin?", + current_step="acquire", + iteration_count=1, + agent_call_counts={}, + execution_history=[], + pico_query=PICOQuery( + patient="60yo male", + intervention="aspirin", + comparison="placebo", + outcome="cardiovascular events", + keywords=["aspirin", "primary prevention"] + ) + ) + +@patch('src.agents.acquire_agent.search_pubmed') +def test_acquire_agent_execute_returns_evidence(mock_search, mock_llm, sample_state): + """Test that AcquireAgent returns evidence list""" + mock_search.return_value = [ + Evidence( + 
title="Aspirin study", + source="JAMA", + pmid="12345", + abstract="Study on aspirin", + relevance_score=0.9 + ) + ] + + agent = AcquireAgent(llm=mock_llm, tools=[]) + result = agent.execute(sample_state) + + assert "evidence_list" in result + assert len(result["evidence_list"]) > 0 diff --git a/tests/agents/test_apply_agent.py b/tests/agents/test_apply_agent.py new file mode 100644 index 0000000..e52ccdc --- /dev/null +++ b/tests/agents/test_apply_agent.py @@ -0,0 +1,46 @@ +import pytest +from unittest.mock import Mock +from src.agents.apply_agent import ApplyAgent +from src.state.schema import WorkflowState, Evidence, AppraisalResults, Recommendation + +@pytest.fixture +def mock_llm(): + llm = Mock() + llm.invoke = Mock(return_value=Mock( + content='{"recommendation": "Consider aspirin with caution", "strength": "Weak", "rationale": "Moderate evidence with bleeding risk", "caveats": ["Monitor for bleeding"]}' + )) + return llm + +@pytest.fixture +def sample_state(): + return WorkflowState( + original_question="Should I prescribe aspirin?", + current_step="apply", + iteration_count=1, + agent_call_counts={}, + execution_history=[], + appraisal_results=AppraisalResults( + evidence=[ + Evidence( + title="Study 1", + source="JAMA", + pmid="123", + abstract="RCT on aspirin", + relevance_score=0.9, + grade_level="Moderate" + ) + ], + has_conflict=False, + conflict_description=None, + summary="Moderate quality evidence" + ) + ) + +def test_apply_agent_execute_returns_recommendation(mock_llm, sample_state): + """Test that ApplyAgent returns Recommendation""" + agent = ApplyAgent(llm=mock_llm, tools=[]) + result = agent.execute(sample_state) + + assert "recommendation" in result + assert isinstance(result["recommendation"], Recommendation) + assert result["recommendation"].strength in ["Strong", "Weak"] diff --git a/tests/agents/test_appraise_agent.py b/tests/agents/test_appraise_agent.py new file mode 100644 index 0000000..5cb2e35 --- /dev/null +++ 
b/tests/agents/test_appraise_agent.py @@ -0,0 +1,40 @@ +import pytest +from unittest.mock import Mock +from src.agents.appraise_agent import AppraiseAgent +from src.state.schema import WorkflowState, Evidence, AppraisalResults + +@pytest.fixture +def mock_llm(): + llm = Mock() + llm.invoke = Mock(return_value=Mock( + content='{"grades": ["Moderate"], "has_conflict": false, "summary": "Good quality evidence"}' + )) + return llm + +@pytest.fixture +def sample_state(): + return WorkflowState( + original_question="Should I prescribe aspirin?", + current_step="appraise", + iteration_count=1, + agent_call_counts={}, + execution_history=[], + evidence_list=[ + Evidence( + title="Study 1", + source="JAMA", + pmid="123", + abstract="RCT on aspirin", + relevance_score=0.9 + ) + ] + ) + +def test_appraise_agent_execute_returns_appraisal(mock_llm, sample_state): + """Test that AppraiseAgent returns AppraisalResults""" + agent = AppraiseAgent(llm=mock_llm, tools=[]) + result = agent.execute(sample_state) + + assert "appraisal_results" in result + assert isinstance(result["appraisal_results"], AppraisalResults) + assert result["appraisal_results"].evidence[0].grade_level == "Moderate" diff --git a/tests/agents/test_ask_agent.py b/tests/agents/test_ask_agent.py new file mode 100644 index 0000000..f00a90a --- /dev/null +++ b/tests/agents/test_ask_agent.py @@ -0,0 +1,35 @@ +import pytest +from unittest.mock import Mock, MagicMock +from src.agents.ask_agent import AskAgent +from src.state.schema import WorkflowState, PICOQuery + +@pytest.fixture +def mock_llm(): + """Mock LLM that returns PICO structure""" + llm = Mock() + llm.invoke = MagicMock(return_value=Mock( + content='{"patient": "60yo male", "intervention": "aspirin", "comparison": "placebo", "outcome": "cardiovascular events", "keywords": ["aspirin", "primary prevention"]}' + )) + return llm + +def test_ask_agent_initialization(mock_llm): + """Test AskAgent can be initialized""" + agent = AskAgent(llm=mock_llm, tools=[]) + 
assert agent.agent_type == "Ask" + +def test_ask_agent_execute_returns_pico(mock_llm): + """Test that AskAgent returns PICOQuery""" + agent = AskAgent(llm=mock_llm, tools=[]) + state = WorkflowState( + original_question="Should I prescribe aspirin for a 60yo male?", + current_step="ask", + iteration_count=0, + agent_call_counts={}, + execution_history=[] + ) + + result = agent.execute(state) + + assert "pico_query" in result + assert isinstance(result["pico_query"], PICOQuery) + assert result["pico_query"].patient == "60yo male" diff --git a/tests/agents/test_assess_agent.py b/tests/agents/test_assess_agent.py new file mode 100644 index 0000000..73af1ed --- /dev/null +++ b/tests/agents/test_assess_agent.py @@ -0,0 +1,38 @@ +import pytest +from unittest.mock import Mock +from src.agents.assess_agent import AssessAgent +from src.state.schema import WorkflowState, Recommendation, Assessment + +@pytest.fixture +def mock_llm(): + llm = Mock() + llm.invoke = Mock(return_value=Mock( + content='{"quality_score": 0.85, "gaps": [], "needs_backtrack": false}' + )) + return llm + +@pytest.fixture +def sample_state(): + return WorkflowState( + original_question="Should I prescribe aspirin?", + current_step="assess", + iteration_count=1, + agent_call_counts={}, + execution_history=[], + recommendation=Recommendation( + text="Consider aspirin with caution", + strength="Weak", + rationale="Moderate evidence", + caveats=["Monitor for bleeding"], + evidence_quality="Moderate" + ) + ) + +def test_assess_agent_execute_returns_assessment(mock_llm, sample_state): + """Test that AssessAgent returns Assessment""" + agent = AssessAgent(llm=mock_llm, tools=[]) + result = agent.execute(sample_state) + + assert "assessment" in result + assert isinstance(result["assessment"], Assessment) + assert 0 <= result["assessment"].quality_score <= 1 diff --git a/tests/agents/test_base.py b/tests/agents/test_base.py new file mode 100644 index 0000000..6ca450b --- /dev/null +++ 
b/tests/agents/test_base.py @@ -0,0 +1,26 @@ +import pytest +from unittest.mock import Mock +from src.agents.base import BaseAgent +from src.state.schema import WorkflowState + +def test_base_agent_initialization(): + """Test BaseAgent can be initialized""" + llm = Mock() + agent = BaseAgent(llm=llm, tools=[], agent_type="Test") + assert agent.llm == llm + assert agent.agent_type == "Test" + +def test_base_agent_execute_not_implemented(): + """Test that execute method must be implemented by subclasses""" + llm = Mock() + agent = BaseAgent(llm=llm, tools=[], agent_type="Test") + state = WorkflowState( + original_question="test", + current_step="test", + iteration_count=0, + agent_call_counts={}, + execution_history=[] + ) + + with pytest.raises(NotImplementedError): + agent.execute(state) diff --git a/tests/test_appraise_grade.py b/tests/test_appraise_grade.py index 034077a..9563ebe 100644 --- a/tests/test_appraise_grade.py +++ b/tests/test_appraise_grade.py @@ -8,7 +8,6 @@ COHORT+NOT_SERIOUS+all upgrades → Moderate (cap at min(points, 3)) CROSS_SECTIONAL+all upgrades → Low (not in _UPGRADE_STUDY_TYPES) """ -import pytest from src.agents.appraise_agent import _compute_grade @@ -90,3 +89,49 @@ def test_cross_sectional_upgrades_not_applied(): "dose_response": "YES", # should be ignored } assert _compute_grade(appraisal) == "Low" + + +def test_cohort_confounding_bias_mitigates_upgrade(): + """confounding_bias_mitigates=YES should trigger +1 upgrade (third upgrade factor).""" + appraisal = { + "study_type": "COHORT", + "risk_of_bias": "NOT_SERIOUS", + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "NO", + "dose_response": "NO", + "confounding_bias_mitigates": "YES", # +1 + } + assert _compute_grade(appraisal) == "Moderate" # 2+1=3, capped at 3 + + +def test_sr_mixed_moderate(): + """SR with MIXED included studies → initial points 3 → Moderate.""" + appraisal = { + 
"study_type": "SYSTEMATIC_REVIEW", + "included_study_type": "MIXED", + "risk_of_bias": "NOT_SERIOUS", + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + } + assert _compute_grade(appraisal) == "Moderate" + + +def test_cohort_serious_bias_blocks_confounding_upgrade(): + """SERIOUS bias blocks confounding_bias_mitigates=YES upgrade.""" + appraisal = { + "study_type": "COHORT", + "risk_of_bias": "SERIOUS", + "inconsistency": "NOT_SERIOUS", + "indirectness": "NOT_SERIOUS", + "imprecision": "NOT_SERIOUS", + "publication_bias": "UNDETECTED", + "large_effect": "NO", + "dose_response": "NO", + "confounding_bias_mitigates": "YES", + } + assert _compute_grade(appraisal) == "Very Low" # 2-1=1, upgrade blocked diff --git a/tests/test_integration_routing.py b/tests/test_integration_routing.py index 6d01d58..8e65111 100644 --- a/tests/test_integration_routing.py +++ b/tests/test_integration_routing.py @@ -9,7 +9,6 @@ """ import json -import pytest from unittest.mock import MagicMock from src.agents.ask_agent import AskAgent from src.state.schema import WorkflowState From 09423a66f39ab54d382952f4d460186e802cb336 Mon Sep 17 00:00:00 2001 From: Winda0001 <13912795021@163.com> Date: Fri, 22 May 2026 15:28:47 +0800 Subject: [PATCH 4/4] feat: replace PubMed pipeline with Hypertension RAG service - Acquire agent now queries hypertension_rag_client (HTTP) instead of PubMed + MedCPT; removes pubmed_api.py and local_evidence_db.py - Add src/tools/hypertension_rag_client.py: retrying HTTP client that aggregates chunk-level results into paper-level Evidence objects - Update state schema: Evidence gains supporting_passages field - Update all agent prompts and judge rubrics for RAG-sourced evidence - Merge Ask router into single LLM call (router_unified.txt) - Add concurrent LLM warm-up at startup to cut cold-start latency - Fix requirements.txt: replace langchain/torch/chromadb stack with openai + httpx; 
update README to reflect new architecture --- .env.example | 18 + README.md | 28 +- requirements.txt | 16 +- src/agents/acquire_agent.py | 523 +++----------------- src/agents/apply_agent.py | 12 +- src/agents/appraise_agent.py | 73 ++- src/agents/ask_agent.py | 41 +- src/config/llm_config.py | 103 ++++ src/config/prompts/acquire_agent.txt | 47 +- src/config/prompts/acquire_ranking.txt | 47 -- src/config/prompts/apply_agent.txt | 23 +- src/config/prompts/appraise_agent.txt | 4 + src/config/prompts/ask/router_unified.txt | 18 +- src/config/prompts/assess_agent.txt | 7 +- src/config/prompts/judge/acquire_judge.txt | 51 +- src/config/prompts/judge/apply_judge.txt | 22 +- src/config/prompts/judge/appraise_judge.txt | 46 +- src/config/prompts/judge/ask_judge.txt | 13 +- src/coordinator/coordinator.py | 17 +- src/judge/judge_llm.py | 36 +- src/main.py | 97 +++- src/state/schema.py | 67 ++- src/tools/hypertension_rag_client.py | 148 ++++++ src/tools/local_evidence_db.py | 232 --------- src/tools/pubmed_api.py | 298 ----------- tests/agents/test_acquire_agent.py | 48 -- tests/agents/test_apply_agent.py | 46 -- tests/agents/test_appraise_agent.py | 40 -- tests/agents/test_base.py | 26 - 29 files changed, 754 insertions(+), 1393 deletions(-) delete mode 100644 src/config/prompts/acquire_ranking.txt create mode 100644 src/tools/hypertension_rag_client.py delete mode 100644 src/tools/local_evidence_db.py delete mode 100644 src/tools/pubmed_api.py delete mode 100644 tests/agents/test_acquire_agent.py delete mode 100644 tests/agents/test_apply_agent.py delete mode 100644 tests/agents/test_appraise_agent.py delete mode 100644 tests/agents/test_base.py diff --git a/.env.example b/.env.example index 3e3e11c..8eac6e2 100644 --- a/.env.example +++ b/.env.example @@ -27,3 +27,21 @@ PUBMED_EMAIL=your_email@example.com # Use a faster/cheaper model for Judge and Scheduling agents (~30–40% faster overall) # If unset, LLM_MODEL is used for all agents. 
# FAST_LLM_MODEL=gpt-3.5-turbo + +# ── Hypertension RAG Service ───────────────────────────────────────────────── +# Base URL of the hypertensiondb FastAPI service (see ../hypertension/). +# Must be running before Acquire stage executes. Start it with: +# cd hypertension && hdb serve run --port 8000 +HYPERTENSION_API_URL=http://localhost:8000 + +# HTTP timeout per /search request, in seconds +HYPERTENSION_API_TIMEOUT=10 + +# Number of chunks to request from /search (client aggregates into max_papers papers) +RAG_SEARCH_TOP_K=15 + +# Max papers to surface to downstream agents (after aggregation by evidence_id) +RAG_MAX_PAPERS=6 + +# Max supporting passages to keep per paper +RAG_MAX_PASSAGES_PER_PAPER=3 diff --git a/README.md b/README.md index ec85264..a1941ee 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![OpenAI Compatible](https://img.shields.io/badge/API-OpenAI%20Compatible-412991.svg)](https://platform.openai.com/) -[![PubMed](https://img.shields.io/badge/data-PubMed%20实时检索-326599.svg)](https://pubmed.ncbi.nlm.nih.gov/) +[![RAG](https://img.shields.io/badge/data-Hypertension%20RAG-326599.svg)](https://github.com/FreedomIntelligence/TrueTruth) </div> @@ -29,13 +29,13 @@ TrueTruth将一段普通的临床问题文本全自动转化为**经过分级的 | 特性 | 说明 | |------|------| -| **5A 工作流** | Ask(PICO 结构化)→ Acquire(PubMed 实时检索)→ Appraise(GRADE 评价)→ Apply(生成推荐)→ Assess(质量审查) | +| **5A 工作流** | Ask(PICO 结构化)→ Acquire(RAG 检索)→ Appraise(GRADE 评价)→ Apply(生成推荐)→ Assess(质量审查) | | **GRADE 确定性计算** | LLM 输出分类标签,Python 代码确定性地计算 GRADE 等级,不依赖 LLM 对评级规则的理解 | -| **五种问题类型** | Therapy / Diagnosis / Prognosis / Harm / Prevention,每种类型触发不同 PubMed 检索过滤器 | +| **五种问题类型** | Therapy / Diagnosis / Prognosis / Harm / Prevention,每种类型触发不同检索策略 | | **四级推荐强度** | Strong、Conditional(间接证据)、Consensus-based(指南/专家共识)、Insufficient 
Evidence | -| **实时文献检索** | 三层 PubMed 查询策略(严格→中等→宽松),24 小时磁盘缓存,MedCPT Listwise 重排序 | +| **Hypertension RAG** | 基于高血压领域 landmark trial 的向量检索服务,提供结构化证据块与 passage 级引用 | | **鲁棒 JSON 解析** | 每个 Agent 与 Judge LLM 内置三阶段 JSON 恢复,LLM 输出格式错误不会导致工作流崩溃 | -| **完整审计追踪** | 每阶段评分(0–1)、问题清单、回退事件、调度决策全部记录,每条引用附带真实 PMID | +| **完整审计追踪** | 每阶段评分(0–1)、问题清单、回退事件、调度决策全部记录,每条引用附带真实来源 | | **OpenAI 兼容** | 支持任何 OpenAI 兼容 API;Judge/Scheduling 可配置独立的快速模型以降低成本 | ### 工作流程 @@ -66,7 +66,7 @@ TrueTruth将一段普通的临床问题文本全自动转化为**经过分级的 - Python **3.10+** - 兼容 OpenAI 的 API Key(OpenAI、Azure OpenAI 或其他兼容服务商) -- 免费的 PubMed 注册邮箱([NCBI API 使用规范](https://www.ncbi.nlm.nih.gov/home/about/policies/)要求) +- Hypertension RAG 服务(见下方说明) - Docker + Docker Compose(仅 Docker 部署方式需要) ### 方式一:Docker(推荐) @@ -74,7 +74,7 @@ TrueTruth将一段普通的临床问题文本全自动转化为**经过分级的 ```bash git clone https://github.com/FreedomIntelligence/TrueTruth.git cd TrueTruth -cp .env.example .env # 填写 LLM_API_KEY、LLM_BASE_URL、LLM_MODEL、PUBMED_EMAIL +cp .env.example .env # 填写 LLM_API_KEY、LLM_BASE_URL、LLM_MODEL make check-env # 验证配置(可选但推荐) make docker-up # 构建镜像并启动,首次约需 3–5 分钟 # 浏览器打开 http://localhost:8080 @@ -100,7 +100,9 @@ make cli QUERY="68岁男性,NSTEMI合并急性消化道出血,DAPT还是氯 LLM_BASE_URL=xxx LLM_API_KEY=sk-... 
LLM_MODEL=xxx -PUBMED_EMAIL=your_email@example.com + +# Hypertension RAG 服务地址(默认本地 8000 端口) +HYPERTENSION_API_URL=http://localhost:8000 # 可选——Judge/Scheduling 使用更快的轻量模型,可节省约 30–40% 耗时 # FAST_LLM_MODEL=gpt-4o-mini @@ -130,7 +132,7 @@ ebm5a/ │ ├── agents/ │ │ ├── base.py # 共享三阶段 JSON 恢复 │ │ ├── ask_agent.py # PICO 结构化 + 问题类型识别 -│ │ ├── acquire_agent.py # PubMed 检索 + MedCPT 重排序 +│ │ ├── acquire_agent.py # Hypertension RAG 检索 │ │ ├── appraise_agent.py # GRADE 评价(并行批次) │ │ ├── apply_agent.py # 推荐综合生成 │ │ └── assess_agent.py # 推理链质量审查 @@ -144,9 +146,8 @@ ebm5a/ │ ├── state/ │ │ └── schema.py # 状态数据结构(dataclass + TypedDict) │ ├── tools/ -│ │ ├── pubmed_api.py # PubMed 客户端 + 24h 磁盘缓存 -│ │ ├── medcpt.py # MedCPT Listwise 重排序 -│ │ └── local_evidence_db.py # 本地向量数据库检索 +│ │ ├── hypertension_rag_client.py # Hypertension RAG HTTP 客户端 +│ │ └── medcpt.py # MedCPT 重排序(可选) │ └── config/ │ ├── llm_config.py # get_llm() / get_fast_llm() │ ├── prompts/ # 各 Agent 与 Judge 提示词(.txt) @@ -168,7 +169,6 @@ ebm5a/ │ └── check_env.py # 配置验证脚本(make check-env) ├── tests/ # 测试目录 ├── docs/ # 设计文档与故障排查 -├── data/cache/ # PubMed 查询缓存(自动生成) ├── logs/ # 运行日志(自动生成) ├── Dockerfile.backend ├── Dockerfile.frontend @@ -259,7 +259,5 @@ MIT © Winda0001 — 详见 [LICENSE](LICENSE) ## 致谢 -- [NCBI PubMed](https://pubmed.ncbi.nlm.nih.gov/) — 实时生物医学文献数据库 - [MedCPT](https://github.com/ncbi/MedCPT) — NCBI 开发的生物医学密集检索与重排序模型 -- [LangChain](https://python.langchain.com/) / [LangGraph](https://langchain-ai.github.io/langgraph/) — 智能体编排框架 - [GRADE Working Group](https://www.gradeworkinggroup.org/) — 证据分级方法论 diff --git a/requirements.txt b/requirements.txt index 753bd59..950b40b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,4 @@ -langchain==0.1.0 -langchain-openai==0.0.5 -langchain-community==0.0.20 -langgraph==0.0.20 -requests==2.31.0 -pytest==7.4.3 -pytest-cov==4.1.0 -pytest-mock==3.12.0 +openai==1.109.1 +httpx==0.28.1 python-dotenv==1.0.0 -torch>=2.0.0 -transformers>=4.36.0 -rank-bm25 -chromadb 
-sentence-transformers +requests==2.31.0 diff --git a/src/agents/acquire_agent.py b/src/agents/acquire_agent.py index 56ee905..ed71635 100644 --- a/src/agents/acquire_agent.py +++ b/src/agents/acquire_agent.py @@ -1,513 +1,130 @@ +"""AcquireAgent — retrieves evidence from the hypertensiondb RAG service. + +Flow: + 1. LLM converts the structured PICO/EBMQuery into a single Chinese + natural-language query string. + 2. HTTP GET /search?q=<query>&top_k=N against hypertensiondb FastAPI. + 3. Aggregate chunk-level results into paper+passages Evidence list. + +PubMed, PMC full-text fetch, BM25 RAG, listwise ranking — all removed. +Domain filtering is handled upstream in AskAgent. +""" from __future__ import annotations -import threading import time -from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple - -from src.agents.base import BaseAgent, robust_parse_json -from src.state.schema import EBMQuery, WorkflowState, Evidence -from src.tools.pubmed_api import fetch_pmc_full_text, search_pubmed -from src.tools.local_evidence_db import search_local - -# Cochrane Handbook Highly Sensitive Search Strategy (HSSS) — -# sensitive version for identifying RCTs and systematic reviews in MEDLINE. -# Used for Therapy and Prevention questions. -# Reference: Cochrane Handbook for Systematic Reviews, Section 4.4. -_HSSS_FILTER = ( - "(randomized controlled trial[pt] OR controlled clinical trial[pt] " - "OR randomized[tiab] OR placebo[tiab] OR randomly[tiab] OR trial[ti] " - "OR systematic[sb])" -) - -# Diagnostic Test Accuracy (DTA) filter for Diagnosis questions. -# Targets sensitivity, specificity, ROC, likelihood ratio, and QUADAS studies. 
-_DTA_FILTER = ( - '(sensitivity[tiab] OR specificity[tiab] OR "diagnostic accuracy"[tiab] ' - 'OR "likelihood ratio"[tiab] OR "ROC curve"[tiab] OR "area under the curve"[tiab] ' - 'OR QUADAS[tiab] OR "diagnostic test"[tiab] OR "predictive value"[tiab] ' - "OR systematic[sb])" -) - -# Observational filter for Prognosis and Harm questions. -# Allows cohort studies; keeps systematic reviews; drops RCT bias. -_OBSERVATIONAL_FILTER = ( - "(cohort[tiab] OR prospective[tiab] OR retrospective[tiab] " - 'OR "follow-up"[tiab] OR prognosis[tiab] OR survival[tiab] ' - 'OR "risk factor"[tiab] OR incidence[tiab] OR mortality[tiab] ' - "OR systematic[sb])" -) - -# Map question_type to the appropriate PubMed filter -_FILTER_BY_QUESTION_TYPE = { - "Therapy": _HSSS_FILTER, - "Prevention": _HSSS_FILTER, - "Diagnosis": _DTA_FILTER, - "Prognosis": _OBSERVATIONAL_FILTER, - "Harm": _OBSERVATIONAL_FILTER, -} - -# Map EBMQuery route_type to the appropriate PubMed filter -_FILTER_BY_ROUTE_TYPE = { - "ebm_pico": _HSSS_FILTER, - "ebm_peo": _OBSERVATIONAL_FILTER, - "ebm_pird": _DTA_FILTER, - "ebm_prognosis": _OBSERVATIONAL_FILTER, - "full_pipeline": _HSSS_FILTER, # default for generic full_pipeline -} - -# Number of top-K articles to select via listwise ranking. 
-_TOP_K = 5 - -# --------------------------------------------------------------------------- -# Lazy-loaded sentence-transformer for RAG reranking -# --------------------------------------------------------------------------- -_embedding_model = None -_embedding_lock = threading.Lock() +from typing import Any, Dict, List, Optional - -def _get_embedding_model(): - """Return a shared SentenceTransformer instance (thread-safe lazy init).""" - global _embedding_model # noqa: PLW0603 - if _embedding_model is None: - with _embedding_lock: - if _embedding_model is None: - try: - from sentence_transformers import SentenceTransformer # noqa: PLC0415 - try: - # Use cached model without network check (avoids HuggingFace timeouts) - _embedding_model = SentenceTransformer( - "all-MiniLM-L6-v2", local_files_only=True - ) - except Exception: - # Model not cached yet — download it once - _embedding_model = SentenceTransformer("all-MiniLM-L6-v2") - except Exception: - _embedding_model = None # graceful degradation - return _embedding_model +from src.agents.base import BaseAgent, split_prompt_for_caching +from src.state.schema import EBMQuery, WorkflowState +from src.tools.hypertension_rag_client import RAGConfig, RAGUnavailable, search class AcquireAgent(BaseAgent): - """ - Agent for acquiring evidence from PubMed. - - Evidence selection pipeline: - 1. LLM builds a PubMed Boolean search query (acquire_agent.txt). - 2. PubMed API returns up to 20 candidates (filter chosen by route_type). - 3. PMC full-text is fetched in parallel for open-access articles. - 4. BM25 + Embedding RAG extracts key sentences from full-text articles. - 5. Keyword-based study type inference runs on all candidates. - 6. LLM performs Listwise ranking → Top-K selection. - 7. Full-text articles are promoted to the front of the ranked list. 
- """ + """Build a natural-language query from PICO and retrieve from hypertensiondb.""" - def __init__(self, llm, ranking_llm=None, tools: List[Any] = None): + def __init__(self, llm, tools: Optional[List[Any]] = None): super().__init__(llm=llm, tools=tools or [], agent_type="Acquire") - self.prompt_template = self._load_prompt("acquire_agent.txt") - self.ranking_prompt_template = self._load_prompt("acquire_ranking.txt") - # Listwise ranking is a classification/sorting task — fast model is sufficient. - self.ranking_llm = ranking_llm or llm - - def _load_prompt(self, filename: str) -> str: - prompt_path = Path(__file__).parent.parent / "config" / "prompts" / filename + prompt_path = Path(__file__).parent.parent / "config" / "prompts" / "acquire_agent.txt" with open(prompt_path, "r", encoding="utf-8") as f: - return f.read() - - def _parse_json(self, content: str) -> dict: - """Parse JSON from LLM response with heuristic error recovery.""" - return robust_parse_json(content) + self.prompt_template = f.read() def _extract_query(self, content: str) -> str: - """Extract search query from LLM response.""" - if "```pubmed" in content: - start = content.find("```pubmed") + 9 + """Extract the natural-language query from the LLM response. + + Expected format: **Query:** ```query <text> ``` + Fallback: first non-empty line after **Query:** marker. 
+ """ + if "```query" in content: + start = content.find("```query") + len("```query") end = content.find("```", start) if end > start: return content[start:end].strip() if "**Query:**" in content: - query_start = content.find("**Query:**") + 10 - remainder = content[query_start:].strip() - if "```" in remainder: - cb_start = remainder.find("```") - after_backticks = remainder[cb_start + 3 :] - newline = after_backticks.find("\n") - if newline >= 0: - after_backticks = after_backticks[newline + 1 :] - cb_end = after_backticks.find("```") - if cb_end >= 0: - return after_backticks[:cb_end].strip() - return remainder.strip() - return content.strip() - - def _use_local_db(self, question_type: str = "Therapy") -> bool: - """Return True to route retrieval through the local obstetrics evidence DB.""" - import os - return os.getenv("USE_LOCAL_DB", "false").lower() == "true" - - def _apply_search_filter(self, query: str, question_type: str = "Therapy", route_type: str = "") -> str: - """Wrap query with an appropriate filter based on route_type (preferred) or question_type.""" - if route_type and route_type in _FILTER_BY_ROUTE_TYPE: - search_filter = _FILTER_BY_ROUTE_TYPE[route_type] - else: - search_filter = _FILTER_BY_QUESTION_TYPE.get(question_type, _HSSS_FILTER) - return f"({query}) AND {search_filter}" - - def _fetch_full_texts(self, candidates: List[Evidence]) -> None: - """Fetch PMC full text for open-access articles in parallel (in-place). - - Only articles with a pmcid are attempted. Results are written directly - to evidence.full_text and evidence.has_full_text. 
- """ - pmc_candidates = [e for e in candidates if e.pmcid] - if not pmc_candidates: - return - - def _fetch_one(ev: Evidence) -> None: - try: - text = fetch_pmc_full_text(ev.pmid) - if text: - ev.full_text = text - ev.has_full_text = True - except Exception: - pass # non-fatal — abstract-only fallback is fine - - with ThreadPoolExecutor(max_workers=8) as pool: - list(pool.map(_fetch_one, pmc_candidates)) - - n_fetched = sum(1 for e in pmc_candidates if e.has_full_text) - print(f"[DEBUG] PMC full-text fetched: {n_fetched}/{len(pmc_candidates)}") - - def _rag_extract( - self, evidence: Evidence, query_terms: List[str] - ) -> Tuple[str, float]: - """Extract key sentences from full text using BM25 → Embedding rerank. - - Pipeline: - 1. Split full_text into sentences. - 2. BM25 retrieves top-8 candidate sentences. - 3. Embedding model reranks to top-3 by cosine similarity to query. - - Returns (key_sentences_str, relevance_boost) where relevance_boost is - the mean cosine similarity of the top-3 sentences (0.0 if unavailable). - Falls back to abstract if full_text is absent. 
- """ - text = evidence.full_text or evidence.abstract or "" - if not text: - return "", 0.0 - - # Split into sentences (simple heuristic — good enough for abstracts/paragraphs) - import re - sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if len(s.strip()) > 20] - if not sentences: - return text[:500], 0.0 - - query_str = " ".join(query_terms) - - # BM25 retrieval - try: - from rank_bm25 import BM25Okapi - tokenised = [s.lower().split() for s in sentences] - bm25 = BM25Okapi(tokenised) - scores = bm25.get_scores(query_str.lower().split()) - top8_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:8] - top8 = [sentences[i] for i in top8_idx] - except Exception: - top8 = sentences[:8] - - # Embedding rerank to top-3 - model = _get_embedding_model() - if model is not None and len(top8) > 1: - try: - import numpy as np - query_emb = model.encode([query_str], normalize_embeddings=True)[0] - sent_embs = model.encode(top8, normalize_embeddings=True) - sims = sent_embs @ query_emb - top3_idx = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:3] - top3 = [top8[i] for i in top3_idx] - boost = float(np.mean([sims[i] for i in top3_idx])) - except Exception: - top3 = top8[:3] - boost = 0.0 - else: - top3 = top8[:3] - boost = 0.0 - - return " … ".join(top3), boost - - def _infer_study_type(self, evidence: Evidence) -> str: - """Infer study type from PubMed publication types (primary) then title/abstract keywords (fallback).""" - # --- Primary: PubMed pubtype metadata (authoritative, index-assigned) --- - pub_types = getattr(evidence, "pub_types", None) or [] - pt_lower = {pt.lower() for pt in pub_types} - if "meta-analysis" in pt_lower: - return "Systematic Review" - if "systematic review" in pt_lower: - return "Systematic Review" - if "randomized controlled trial" in pt_lower or "controlled clinical trial" in pt_lower: - return "RCT" - if "clinical trial" in pt_lower: - return "RCT" - if "observational study" in pt_lower or 
"cohort study" in pt_lower: - return "Cohort Study" - if "case-control study" in pt_lower or "case control study" in pt_lower: - return "Case-Control Study" - if "case reports" in pt_lower: - return "Case Report" - if "review" in pt_lower: - # "Review" pubtype without "Systematic Review" → narrative review - return "Narrative Review" - - # --- Fallback: keyword scan of title + abstract --- - text = f"{evidence.title} {evidence.abstract or ''}".lower() - if "systematic review" in text or "meta-analysis" in text: - return "Systematic Review" - if ( - "randomized controlled trial" in text - or "randomised controlled trial" in text - or "randomized clinical trial" in text - or "randomised clinical trial" in text - or " randomized " in text - or " randomised " in text - ): - return "RCT" - if "cohort study" in text or "cohort" in text: - return "Cohort Study" - if "case-control" in text: - return "Case-Control Study" - if "cross-sectional" in text: - return "Cross-Sectional Study" - if "case report" in text or "case series" in text: - return "Case Report" - return "Other" - - def _listwise_rank( - self, - candidates: List[Evidence], - pico: Dict, - top_k: int = _TOP_K, - ) -> List[Evidence]: - """ - Select and rank Top-K evidence using LLM Listwise strategy. - - The LLM receives all candidates at once and reasons comparatively to - produce a ranked selection — superior to pointwise scoring because it - considers the relative value of each article against the others. - - Returns the selected Evidence objects with rank-normalised - relevance_score (rank 1 → 1.0, rank K → ~0.1, linear decay). 
- """ - if not candidates: - return [] - - actual_k = min(top_k, len(candidates)) - - # Build candidate block for prompt - lines = [] - for i, e in enumerate(candidates): - study_hint = f"[{e.study_type}] " if e.study_type else "" - abstract_preview = ( - e.key_sentences if e.key_sentences else (e.abstract or "")[:150] - ) - lines.append( - f"[{i + 1}] {study_hint}{e.title}\n" - f" Abstract: {abstract_preview}" - ) - candidate_text = "\n\n".join(lines) - - prompt = self.ranking_prompt_template.format( - patient=pico["patient"], - intervention=pico["intervention"], - comparison=pico["comparison"], - outcome=pico["outcome"], - total=len(candidates), - k=actual_k, - candidates=candidate_text, - ) - - response = self.ranking_llm.invoke(prompt) - print( - f"[DEBUG] Listwise ranking response (first 300 chars): {response.content[:300]}" - ) - - # Parse ranked IDs, validate, deduplicate - try: - ranking_dict = self._parse_json(response.content) - raw_ids = [ - item["id"] - for item in ranking_dict.get("ranked_selection", []) - if isinstance(item.get("id"), int) - ] - except Exception as e: - print(f"[DEBUG] Listwise ranking parse failed ({e}), using original order") - raw_ids = list(range(1, len(candidates) + 1)) - - seen: set = set() - ranked_ids: List[int] = [] - for rid in raw_ids: - if rid not in seen and 1 <= rid <= len(candidates): - seen.add(rid) - ranked_ids.append(rid) - - selected = ranked_ids[:top_k] - n = len(selected) - - result: List[Evidence] = [] - for rank, article_id in enumerate(selected): - evidence = candidates[article_id - 1] # prompt uses 1-based IDs - # Linear rank-normalised score: rank 0 → 1.0, rank n-1 → 0.1 - evidence.relevance_score = round(1.0 - (rank / max(n, 1)) * 0.9, 3) - result.append(evidence) - - return result - - def execute(self, state: WorkflowState) -> Dict[str, Any]: - """Execute Acquire agent: build query → search → full-text → RAG → listwise rank.""" - # Prefer EBMQuery (new routing); fall back to legacy PICOQuery + tail = 
content.split("**Query:**", 1)[1] + for line in tail.splitlines(): + stripped = line.strip().strip("`") + if stripped: + return stripped + return content.strip().splitlines()[0] if content.strip() else "" + + def _build_pico_dict(self, state: WorkflowState) -> Dict[str, Any]: ebm_query: Optional[EBMQuery] = state.get("ebm_query") pico = state.get("pico_query") - - if ebm_query is None and pico is None: - raise ValueError("No EBMQuery or PICOQuery found in state") - - # Derive a unified pico_dict for the ranking prompt (always needed) if ebm_query is not None: - pico_dict = { + return { "patient": ebm_query.patient, "intervention": ebm_query.primary_focus, "comparison": ebm_query.comparator or "", "outcome": ebm_query.outcome, + "keywords": ebm_query.keywords, } - query_keywords = ebm_query.keywords - route_type = f"ebm_{ebm_query.query_type}" # e.g. "ebm_pico", "ebm_pird" - else: - pico_dict = { + if pico is not None: + return { "patient": pico.patient, "intervention": pico.intervention, "comparison": pico.comparison, "outcome": pico.outcome, + "keywords": pico.keywords, } - query_keywords = pico.keywords - route_type = state.get("route_type") or "" + raise ValueError("AcquireAgent: state has neither ebm_query nor pico_query") + + def execute(self, state: WorkflowState) -> Dict[str, Any]: + pico_dict = self._build_pico_dict(state) backtrack_context = "" if state.get("backtrack_reason"): backtrack_context = ( - f"Previous search failed: {state['backtrack_reason']}\n" - "Please adjust your search strategy accordingly." + f"Previous search returned unsatisfactory results: " + f"{state['backtrack_reason']}\nAdjust the query accordingly." 
) - # Step 1: LLM builds Boolean search query + # Step 1: LLM builds natural-language Chinese query prompt = self.prompt_template.format( patient=pico_dict["patient"], intervention=pico_dict["intervention"], comparison=pico_dict["comparison"], outcome=pico_dict["outcome"], - keywords=", ".join(query_keywords), + keywords=", ".join(pico_dict["keywords"]), backtrack_context=backtrack_context, ) - # Split into system + user messages so the static prefix (role, worked - # example, instructions) gets prefix-cached by the gateway. See - # base.split_prompt_for_caching. - from src.agents.base import split_prompt_for_caching prompt = split_prompt_for_caching(prompt) + t0 = time.time() response = self.llm.invoke(prompt) print(f"[TIMING] Acquire query LLM: {time.time()-t0:.1f}s") - base_query = self._extract_query(response.content) - print(f"[DEBUG] Base query: {base_query}") - - # Step 2: Search — local obstetrics DB (full-text) or PubMed fallback - question_type = state.get("question_type") or "Therapy" - search_query_used = "" + query = self._extract_query(response.content) + print(f"[DEBUG] Acquire NL query: {query}") + # Step 2: HTTP /search against hypertensiondb + t0 = time.time() try: - t0 = time.time() - if self._use_local_db(question_type): - print( - f"[DEBUG] question_type={question_type}, routing to local obstetrics DB" - ) - raw_results = search_local(query=base_query, top_k=20) - search_query_used = base_query - print(f"[DEBUG] Local DB returned {len(raw_results)} articles") - print(f"[TIMING] Local DB search: {time.time()-t0:.1f}s") - else: - filtered_query = self._apply_search_filter( - base_query, question_type=question_type, route_type=route_type - ) - print( - f"[DEBUG] route_type={route_type}, question_type={question_type}, " - f"filtered query: {filtered_query}" - ) - raw_results = search_pubmed(query=filtered_query, max_results=20) - print(f"[DEBUG] PubMed (filtered) returned {len(raw_results)} articles") - if len(raw_results) == 0: - print( - "[DEBUG] 
Filtered query returned 0 results — falling back to base query" - ) - raw_results = search_pubmed(query=base_query, max_results=20) - print(f"[DEBUG] PubMed (base) returned {len(raw_results)} articles") - search_query_used = base_query - else: - search_query_used = filtered_query - print(f"[TIMING] PubMed search (parallel fetch): {time.time()-t0:.1f}s") - except Exception as e: + evidence_list, degraded = search(query) + print(f"[TIMING] hypertensiondb /search: {time.time()-t0:.1f}s") + if degraded: + print(f"[WARN] RAG degraded: {degraded}") + except RAGUnavailable as exc: + print(f"[ERROR] RAG unavailable: {exc}") return { "evidence_list": [], - "search_query": search_query_used, + "search_query": query, "total_results": 0, "selected_count": 0, - "error": str(e), + "error": f"hypertension_api_unavailable: {exc}", + "rag_degraded": None, } - # Step 3: Fetch PMC full texts in parallel for open-access articles - t0 = time.time() - self._fetch_full_texts(raw_results) - print(f"[TIMING] PMC full-text fetch: {time.time()-t0:.1f}s") - - # Step 4: RAG extract key sentences for full-text articles - rag_query_terms = query_keywords or base_query.split()[:10] - for ev in raw_results: - if ev.has_full_text and ev.full_text: - key_sents, boost = self._rag_extract(ev, rag_query_terms) - ev.key_sentences = key_sents - # Slightly boost relevance score for full-text articles (applied after ranking) - ev._rag_boost = boost # type: ignore[attr-defined] - - # Step 5: Infer study type for all candidates (used as hint in ranking prompt) - for evidence in raw_results: - evidence.study_type = self._infer_study_type(evidence) - - print(f"[DEBUG] Study types inferred for {len(raw_results)} candidates") - - # Step 6: LLM Listwise ranking → Top-K selection - t0 = time.time() - selected = self._listwise_rank(raw_results, pico_dict, top_k=_TOP_K) - print(f"[TIMING] Listwise ranking LLM: {time.time()-t0:.1f}s") - - # Step 7: Promote full-text articles to the front (stable sort) - 
full_text_first = sorted(selected, key=lambda e: 0 if e.has_full_text else 1) - - for i, e in enumerate(full_text_first): - ft_flag = "[FT]" if e.has_full_text else "" + for i, e in enumerate(evidence_list): print( - f"[DEBUG] Rank {i + 1}{ft_flag}: score={e.relevance_score:.3f}, " - f"type={e.study_type}, title={e.title[:80]}..." + f"[DEBUG] Rank {i+1}: score={e.relevance_score:.3f} " + f"id={e.evidence_id} title={(e.title or '')[:60]}... " + f"passages={len(e.supporting_passages)}" ) - print( - f"[DEBUG] Listwise selected {len(full_text_first)}/{len(raw_results)} articles " - f"({sum(1 for e in full_text_first if e.has_full_text)} with full text)" - ) - - study_type_distribution: Dict[str, int] = {} - for e in full_text_first: - t = e.study_type or "Unknown" - study_type_distribution[t] = study_type_distribution.get(t, 0) + 1 return { - "evidence_list": full_text_first, - "search_query": search_query_used, - "total_results": len(raw_results), - "selected_count": len(full_text_first), - "study_type_distribution": study_type_distribution, + "evidence_list": evidence_list, + "search_query": query, + "total_results": len(evidence_list), + "selected_count": len(evidence_list), + "rag_degraded": degraded or None, } diff --git a/src/agents/apply_agent.py b/src/agents/apply_agent.py index f5bda78..ce9906d 100644 --- a/src/agents/apply_agent.py +++ b/src/agents/apply_agent.py @@ -117,9 +117,15 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: evidence_summary = "\n\n".join( [ - f"Evidence {i+1}:\nTitle: {e.title}\nGRADE: {e.grade_level}\n" + f"Evidence {i+1} [{e.evidence_id or 'unknown'}]:\n" + f"Title: {e.title}\n" + f"GRADE: {e.grade_level or 'unknown'}\n" f"Study Type: {e.study_type or 'Unknown'}\n" - f"Key Findings:\n{e.key_sentences or e.abstract or '(无摘要)'}" + f"Year: {e.year or 'unknown'} | Language: {e.language or 'unknown'}\n" + f"Supporting passages:\n" + "\n".join( + f" [{j+1}] [{e.evidence_id or 'unknown'} / {p.section}]\n \"{p.snippet}\"" + for j, 
p in enumerate(e.supporting_passages) + ) for i, e in enumerate(appraisal.evidence) ] ) @@ -155,7 +161,7 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: backtrack_context=backtrack_context, ) - response = self.llm.invoke(prompt) + response = self.llm.stream_reasoning(prompt, prefix="\n[Apply reasoning] ") try: rec_dict = self._parse_json(response.content) except ValueError: diff --git a/src/agents/appraise_agent.py b/src/agents/appraise_agent.py index 4d66008..ddbb42d 100644 --- a/src/agents/appraise_agent.py +++ b/src/agents/appraise_agent.py @@ -167,30 +167,33 @@ def _parse_json(self, content: str) -> dict: return robust_parse_json(content) def _format_evidence_list(self, evidence_list) -> str: - """Format evidence list for the prompt, including full abstract and pub_types.""" + """Format evidence list for the prompt using supporting passages.""" parts = [] for i, e in enumerate(evidence_list): - abstract = (getattr(e, "abstract", "") or "") - study_type_hint = getattr(e, "study_type", "") or "" - hint_str = ( - f"\nSource DB study_type hint: {study_type_hint}" - if study_type_hint - else "" - ) - # pub_types from PubMed metadata is authoritative for study design. - # Pass it explicitly so the Agent uses it instead of guessing from text. 
- pub_types = getattr(e, "pub_types", None) or [] - pub_types_str = ( - f"\nPubMed pub_types (authoritative): {', '.join(pub_types)}" - if pub_types - else "" - ) + passages_text = "\n".join( + f" [{j+1}] Section: {p.section} (score: {p.score:.2f})\n \"{p.snippet}\"" + for j, p in enumerate(e.supporting_passages) + ) or " (无 passages)" + + # Pre-computed metadata from frontmatter (authoritative when present) + pre_computed = [] + if e.study_type: + pre_computed.append(f"study_type (pre-computed): {e.study_type}") + if e.grade_level: + pre_computed.append(f"grade_level (pre-computed, DO NOT re-derive): {e.grade_level}") + if e.rob_overall: + pre_computed.append(f"rob_overall (pre-computed, DO NOT re-derive): {e.rob_overall}") + pre_str = "\n".join(pre_computed) if pre_computed else "(无预计算字段,请从 passages 推断)" + parts.append( f"Evidence {i + 1}:\n" f"Title: {e.title}\n" f"Source: {e.source}\n" - f"PMID: {e.pmid}{hint_str}{pub_types_str}\n" - f"Abstract: {abstract}" + f"Evidence ID: {e.evidence_id or 'unknown'}\n" + f"Year: {e.year or 'unknown'} | Language: {e.language or 'unknown'}\n" + f"Tags: {', '.join(e.tags) if e.tags else 'none'}\n" + f"Pre-computed fields:\n{pre_str}\n" + f"Supporting passages:\n{passages_text}" ) return "\n\n".join(parts) @@ -315,16 +318,40 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: graded_evidence = [] for i, evidence in enumerate(evidence_list): - # Match by index (LLM outputs in same order as input) appraisal = study_appraisals[i] if i < len(study_appraisals) else {} study_type = appraisal.get("study_type", "CASE_REPORT") - computed_grade = _compute_grade(appraisal) - # Set grade and sync study_type on the Evidence object - # (overrides acquire_agent keyword-inferred label with Appraise LLM classification) + # If pre-computed grade/rob are available from frontmatter metadata, + # use them directly instead of LLM-derived values. 
+ if evidence.grade_level and evidence.rob_overall: + # Map pre-computed grade string to our internal label + _GRADE_STR_MAP = { + "high": "High", "moderate": "Moderate", + "low": "Low", "very_low": "Very Low", + } + computed_grade = _GRADE_STR_MAP.get( + evidence.grade_level.lower(), evidence.grade_level + ) + # GRADE standard: only "high" RoB (serious bias confirmed) warrants + # downgrade. "some_concerns" means possible bias but impact on + # conclusion is uncertain — do NOT automatically downgrade. + appraisal["risk_of_bias"] = { + "low": "NOT_SERIOUS", + "some_concerns": "NOT_SERIOUS", + "high": "SERIOUS", + }.get(evidence.rob_overall.lower(), "NOT_SERIOUS") + else: + computed_grade = _compute_grade(appraisal) + + # Sync study_type from pre-computed field if available + if evidence.study_type: + # Already set from RAG client metadata + pass + else: + evidence.study_type = _GRADE_CODE_TO_LABEL.get(study_type, study_type) + evidence.grade_level = computed_grade - evidence.study_type = _GRADE_CODE_TO_LABEL.get(study_type, study_type) graded_evidence.append(evidence) # Build rich rationale record for downstream consumers (including Judge) diff --git a/src/agents/ask_agent.py b/src/agents/ask_agent.py index c40a952..938d6f3 100644 --- a/src/agents/ask_agent.py +++ b/src/agents/ask_agent.py @@ -144,6 +144,16 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: self._last_router_payload = self._call_unified_router(question, backtrack_context) route_result = self._last_router_payload + # ── Step 0: Hypertension domain filter ──────────────────────────── + hypertension_related = route_result.get("hypertension_related", True) + domain_rationale = route_result.get("domain_rationale", "") + if not hypertension_related: + logger.info("AskAgent: out-of-domain rejection: %s", domain_rationale) + return self._handle_out_of_domain( + question=question, + domain_rationale=domain_rationale, + ) + route_type = route_result.get("route_type", "full_pipeline") if route_type 
not in _VALID_ROUTE_TYPES: logger.warning("Unknown route_type '%s', defaulting to full_pipeline", route_type) @@ -200,13 +210,14 @@ def execute(self, state: WorkflowState) -> Dict[str, Any]: # ------------------------------------------------------------------ def _call_unified_router(self, question: str, backtrack_context: str) -> dict: - """Call the unified router prompt. Returns parsed JSON with routing - decision and (for non-Diagnosis full_pipeline) a `query` sub-object.""" + """Call the unified router prompt. Streams the Reasoning section live to + the console so the user sees the routing rationale immediately, then + returns the parsed JSON payload.""" prompt = self._prompts["router_unified"].format( question=question, backtrack_context=backtrack_context, ) - response = self.llm.invoke(prompt) + response = self.llm.stream_reasoning(prompt, prefix="\n[Ask reasoning] ") try: return robust_parse_json(response.content) except ValueError as exc: @@ -218,6 +229,30 @@ def _call_unified_router(self, question: str, backtrack_context: str) -> dict: # Route handlers # ------------------------------------------------------------------ + def _handle_out_of_domain(self, question: str, domain_rationale: str) -> Dict[str, Any]: + """Return a soft-rejection direct answer for non-hypertension questions.""" + if domain_rationale: + body = ( + f"本系统专注于高血压相关的循证医学问题。您的问题主要涉及" + f"**{domain_rationale}**,不在覆盖范围内。建议改问与高血压相关的方面。" + ) + else: + body = ( + "本系统专注于高血压相关的循证医学问题。" + "您的问题不属于高血压领域,建议改问与高血压相关的方面。" + ) + return { + "route_type": "direct_answer", + "route_confidence": 1.0, + "question_type": "Background", + "direct_answer_output": { + "answer": body, + "requires_pipeline": False, + }, + "should_terminate": True, + "out_of_domain": True, + } + def _handle_direct_answer( self, question: str, diff --git a/src/config/llm_config.py b/src/config/llm_config.py index 0a499d3..1637011 100644 --- a/src/config/llm_config.py +++ b/src/config/llm_config.py @@ -191,6 +191,109 @@ def 
_invoke_blocking(self, messages) -> _LLMResponse: resp.choices[0].message.content, usage=usage, ttft=None, elapsed=elapsed ) + def stream_reasoning(self, prompt, prefix: str = "") -> _LLMResponse: + """Stream the LLM response, printing only the Reasoning section live. + + The LLM output format is: + **JSON:** ```json {...} ``` + --- + ### Reasoning: ... + + This method scans for a reasoning marker and streams that section to + stdout in real time. The full response (JSON + Reasoning) is returned + in _LLMResponse.content so callers can parse JSON from it as usual. + """ + fold_system = os.getenv("EBM_FOLD_SYSTEM_INTO_USER", "1") != "0" + if isinstance(prompt, dict) and "system" in prompt and "user" in prompt: + if fold_system: + messages = [{"role": "user", "content": prompt["system"] + "\n\n" + prompt["user"]}] + else: + messages = [ + {"role": "system", "content": prompt["system"]}, + {"role": "user", "content": prompt["user"]}, + ] + elif isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + else: + messages = [{"role": "user", "content": str(prompt)}] + + t0 = time.time() + ttft = None + chunks: list[str] = [] + usage = None + state = "SCAN" + scan_buf = "" + printed_prefix = False + MARKERS = ("### Reasoning:", "**Reasoning:**", "---\n\n### Reasoning") + # When printing Reasoning that comes BEFORE the JSON block, stop when we + # hit the JSON fence so we don't dump the raw JSON to the console. 
+ JSON_FENCE = "**JSON:**" + + stream = _get_client(self._purpose).chat.completions.create( + model=self._model, + messages=messages, + temperature=self._temperature, + stream=True, + stream_options={"include_usage": True}, + ) + for chunk in stream: + if chunk.choices: + delta = chunk.choices[0].delta if chunk.choices[0] else None + if delta is not None and getattr(delta, "content", None): + token = delta.content + if ttft is None: + ttft = time.time() - t0 + chunks.append(token) + if state == "SCAN": + scan_buf += token + for marker in MARKERS: + if marker in scan_buf: + state = "PRINT" + after = scan_buf.split(marker, 1)[1] + if prefix and not printed_prefix: + print(prefix, end="", flush=True) + printed_prefix = True + if after: + print(after, end="", flush=True) + scan_buf = "" + break + else: + if len(scan_buf) > 200: + scan_buf = scan_buf[-100:] + elif state == "PRINT": + if not printed_prefix: + if prefix: + print(prefix, end="", flush=True) + printed_prefix = True + # Stop printing if we hit the JSON block marker + scan_buf += token + if JSON_FENCE in scan_buf: + state = "DONE" + before_json = scan_buf.split(JSON_FENCE, 1)[0] + if before_json.strip(): + print(before_json, end="", flush=True) + print() + scan_buf = "" + else: + if len(scan_buf) > len(JSON_FENCE) + 5: + print(scan_buf[:-len(JSON_FENCE)], end="", flush=True) + scan_buf = scan_buf[-len(JSON_FENCE):] + # state == "DONE": silently buffer the rest + chunk_usage = getattr(chunk, "usage", None) + if chunk_usage is not None: + usage = chunk_usage + + if state == "PRINT" and scan_buf.strip(): + print(scan_buf, end="", flush=True) + print() + elif state == "PRINT": + print() + + elapsed = time.time() - t0 + content = "".join(chunks) + self._record_telemetry(usage, ttft=ttft, elapsed=elapsed) + return _LLMResponse(content, usage=usage, ttft=ttft, elapsed=elapsed) + def _invoke_streaming(self, messages) -> _LLMResponse: t0 = time.time() ttft = None diff --git a/src/config/prompts/acquire_agent.txt 
b/src/config/prompts/acquire_agent.txt index 21c1cb7..7e13a05 100644 --- a/src/config/prompts/acquire_agent.txt +++ b/src/config/prompts/acquire_agent.txt @@ -1,41 +1,42 @@ -You are an expert systematic review librarian with extensive experience constructing searches for Cochrane reviews and clinical practice guidelines. You translate clinical PICO questions into comprehensive PubMed Boolean queries, drawing on your deep knowledge of how clinical research is actually reported in the literature. +你是一名循证医学检索专家。你的任务:把结构化 PICO 临床问题转写成**一句中英文混合检索语句**,供后端 RAG 系统检索高血压证据库。 -**Worked Example (study this translation carefully):** +证据库同时收录中英文文献,中英文混合 query 能提升双语召回效果。 -PICO: Patients with AMI / Aspirin / No aspirin / Mortality -Naive (wrong) query: ... AND ("no aspirin" OR "without aspirin") ... -Expert (correct) query: ("Myocardial Infarction"[MeSH] OR "acute myocardial infarction") AND ("Aspirin"[MeSH] OR aspirin OR "acetylsalicylic acid") AND (placebo OR "control group" OR "usual care" OR "standard care" OR "conservative management") AND ("Mortality"[MeSH] OR mortality OR survival OR death) -Why: Clinical trials never describe their control arm as "no aspirin" — they use "placebo", "control", or "usual care". Searching for the literal no-treatment phrase excludes nearly all relevant RCTs. +**关键要求**: +1. 输出为单一检索语句,**不超过 80 字**(含英文括号内容) +2. 核心中文描述 + 括号内补充 1-2 个英文关键词(MeSH 词或常用缩写) +3. 紧扣 PICO 核心:把人群、干预、对比(如有)、结局合成一句 +4. 英文术语直接取自 Keywords 字段,选最相关的 1-2 个放入括号 +5. 不要 Boolean 操作符;不要加研究类型过滤词 -**Instructions:** -1. **Term Expansion**: For each PICO element, identify relevant MeSH terms and keyword synonyms. Use truncation (*) where appropriate. -2. **Comparison Translation**: If the Comparison describes no-treatment or no-intervention (e.g., "no drug", "no aspirin", "placebo", "standard care"), translate it using the terms actually found in clinical trial reports: `placebo OR "control group" OR "usual care" OR "standard care"`. Do not search for the literal no-treatment phrase. -3. 
**Strategy Construction**: Combine synonyms with OR, and combine PICO concepts with AND. -4. **Refinement**: Explicitly address any "Previous Search Feedback" in your reasoning. -5. **Final Output**: Present the final query string within a designated code block. +**示例**: -**Response Format:** - -**Reasoning:** -<Explain your search strategy here. Which MeSH terms did you choose? How did you handle the Comparison term?> +PICO: +- Patient: 中重度原发性高血压患者 +- Intervention: 缬沙坦联合氨氯地平 +- Comparison: 缬沙坦单药 +- Outcome: 收缩压下降幅度、血压达标率 +- Keywords: hypertension, ARB, amlodipine, blood pressure control, combination therapy +输出: **Query:** -```pubmed -((Patient Term 1 OR Patient Term 2) AND (Intervention Term 1 OR Intervention Term 2)) +```query +缬沙坦联合氨氯地平(ARB + CCB combination)vs 缬沙坦单药治疗中重度高血压的收缩压下降与达标率 ``` +--- + %%USER_INPUT_BELOW%% -Translate the following PICO into a PubMed Boolean query. Output **strictly** in the response format specified above: a brief **Reasoning:** section, then the query inside a ```pubmed code block. Do NOT output a free-form explanation; do NOT propose multiple alternative queries. +请把下面的 PICO 转写成一句中英文混合检索语句,**严格**按上面的输出格式(`**Query:**` + ```query 代码块)。 -**Input Data:** -PICO Components: +**Input:** - Patient: {patient} - Intervention: {intervention} - Comparison: {comparison} - Outcome: {outcome} -- Keywords: {keywords} +- Keywords (选 1-2 个最相关的放入括号): {keywords} **Previous Search Feedback (if any):** {backtrack_context} -(Note: If feedback indicates "0 results", broaden the search by removing restrictive terms or focusing only on P and I. If "too many results", apply stricter filters.)
+(如反馈"0 结果",把 query 进一步泛化,去掉过细的剂量/亚组限定,保留核心英文术语) diff --git a/src/config/prompts/acquire_ranking.txt b/src/config/prompts/acquire_ranking.txt deleted file mode 100644 index 263fe78..0000000 --- a/src/config/prompts/acquire_ranking.txt +++ /dev/null @@ -1,47 +0,0 @@ -# Role -你是一名系统评价专家,负责从PubMed检索结果中筛选并排序最相关的证据。 - -# PICO问题 -- Patient(患者/人群): {patient} -- Intervention(干预): {intervention} -- Comparison(对照): {comparison} -- Outcome(结局): {outcome} - -# 候选文献列表(共 {total} 篇) -{candidates} - -# 任务 -从上述 {total} 篇候选文献中,选出最能回答PICO问题的 **Top-{k}** 篇,并按相关性**从高到低**排序。 - -## 选择标准(按优先级排序) - -1. **研究设计层级**(首要考虑) - 系统评价/Meta分析 > RCT > 队列研究 > 病例对照 > 病例报告 - 如果检索到SR或高质量RCT,不必为凑数量而纳入低层级证据。 - -2. **PICO匹配精度** - - Patient:研究人群(年龄、疾病状态)是否与PICO吻合? - - Intervention:干预措施是否与PICO一致(同类药物、同类操作)? - - Outcome:报告的结局是否是临床关心的直接结局(而非代理指标)? - -3. **直接性与可用性** - 优先选择能直接回答问题的证据;若证据来自相似但非完全匹配的人群,也可纳入并在rationale中说明。 - -## 重要规则 -- 如果全部候选文献与PICO均不相关,`ranked_selection` 输出空列表 `[]` -- 不要选择明显不相关的文献(如标题/摘要显示研究的是完全不同的疾病或干预) -- 每篇文献的ID对应上方列表中的数字编号(1 到 {total}) -- 不要重复选择同一篇文献 - -# Output Format -仅输出以下JSON格式,不要包含任何其他文本: - -```json -{{ - "ranked_selection": [ - {{"id": 3, "rationale": "Cochrane SR directly comparing X vs Y in target population, high relevance"}}, - {{"id": 1, "rationale": "RCT matching P/I/O, adequate sample size"}}, - {{"id": 7, "rationale": "Cohort study, similar population, reports key outcome"}} - ] -}} -``` diff --git a/src/config/prompts/apply_agent.txt b/src/config/prompts/apply_agent.txt index 8903845..c946829 100644 --- a/src/config/prompts/apply_agent.txt +++ b/src/config/prompts/apply_agent.txt @@ -1,5 +1,11 @@ You are a clinical recommendation synthesizer operating under strict Evidence-Based Medicine (EBM) principles. +**Citation Rule (MANDATORY):** +Each piece of evidence is provided with its passages pre-labelled as `[evidence_id / section]`. 
+When you cite a specific finding in your `recommendation` or `rationale`, you MUST append the citation tag immediately after the claim, verbatim from the passage label. +Example: "缬沙坦联合氨氯地平可使收缩压多下降 8.4 mmHg [EV-RCT-2026-PENG-001 / 结果/主要结局]。" +Do NOT invent citation IDs. Only cite passages that actually appear in the Evidence Summary below. + **CRITICAL CONSTRAINT: Evidence Exclusivity** For **Strong**, **Weak**, and **Conditional** recommendations: your recommendation MUST be derived SOLELY from the evidence provided below. You are PROHIBITED from supplementing or overriding the provided evidence with your training knowledge or general clinical guidelines. If the provided evidence contradicts common practice, faithfully represent what the evidence shows. @@ -44,12 +50,17 @@ Determine if the evidence is sufficient to make a recommendation: > **Important**: "Insufficient Evidence" should be reserved for situations where results genuinely conflict or no relevant evidence exists at all. Do NOT output "Insufficient Evidence" simply because evidence quality is Very Low — consistent Very Low evidence still supports a Weak recommendation with caveats. 
**Step 3 - Recommendation Formulation (only if evidence is sufficient):** -- Strength must strictly follow evidence quality: - - High/Moderate quality + consistent results → "Strong" - - Low quality OR inconsistent results → "Weak" - - Very Low quality → only use if results are consistent and no better evidence exists; mark as "Weak" with explicit caveats - - Indirect evidence only (different population / surrogate endpoint / similar intervention) → "Conditional" (state what indirect evidence supports it and why direct evidence is lacking) - - Only expert consensus / guidelines, no direct study evidence → "Consensus-based" (cite the specific guideline or consensus) +- Strength must follow GRADE principles (Guyatt et al., BMJ 2011): + - **High/Moderate quality + consistent results + benefits clearly outweigh harms → "Strong"** + - Note: indirectness (partial population mismatch, surrogate outcomes) affects evidence *quality*, not *strength* on its own. If the adopted evidence is Moderate and direction is consistent with clear benefit, give Strong and note limitations in caveats. + - **Moderate quality + serious inconsistency across studies (conflicting directions), OR benefits vs. harms genuinely uncertain → "Conditional"** + - **Low quality + consistent results (same direction) → "Conditional"** (consistent Low evidence still supports action) + - **Low quality + inconsistent results → "Weak"** + - **Very Low quality + consistent results → "Conditional"** with explicit uncertainty caveats + - **Very Low quality + inconsistent or conflicting results → "Weak"** + - Indirect evidence only (different population / surrogate endpoint / similar intervention) → "Conditional" + - Only expert consensus / guidelines, no direct study evidence → "Consensus-based" +- **Important**: Population mismatch, limited sample size, short follow-up, indirectness — document these in `caveats`. They affect evidence quality, not recommendation strength on their own. 
Only downgrade strength when benefits vs. harms are genuinely uncertain or results conflict. - Do NOT upgrade strength based on clinical intuition or training knowledge. **Step 4 - Determine Adopted Evidence Quality:** diff --git a/src/config/prompts/appraise_agent.txt b/src/config/prompts/appraise_agent.txt index 0462227..c34aa5c 100644 --- a/src/config/prompts/appraise_agent.txt +++ b/src/config/prompts/appraise_agent.txt @@ -2,10 +2,14 @@ **你的任务:** 对每篇证据的GRADE评价因素进行分类判断,只输出结构化的分类标签,**不要自己计算或写出最终等级**(最终GRADE等级由系统代码根据你的分类自动计算)。 +**重要规则:若某篇证据的"Pre-computed fields"中已提供 `grade_level (pre-computed)` 或 `rob_overall (pre-computed)` 字段,你必须直接使用这些值,不得重新推断或覆盖。对应 JSON 输出中填入相同值即可。只对"无预计算字段"的证据进行推断。** + --- ## 输入数据 +每篇证据以如下格式提供:标题、语言、年份、标签,以及从证据库中检索到的若干 supporting passages(各附所属节区名称和相关性分数)。**请基于这些 passages 中的内容进行评价,不要编造 passages 中未提及的数据。** + **证据列表:** {evidence_list} diff --git a/src/config/prompts/ask/router_unified.txt b/src/config/prompts/ask/router_unified.txt index 6fba2f7..4626fd0 100644 --- a/src/config/prompts/ask/router_unified.txt +++ b/src/config/prompts/ask/router_unified.txt @@ -1,10 +1,16 @@ -You are a clinical question triage and structuring expert for an Evidence-Based Medicine (EBM) decision-support system. +You are a clinical question triage and structuring expert for an Evidence-Based Medicine (EBM) decision-support system specialized in **hypertension**. -Your job has **two parts** in a single response: -1. **Route classification**: classify the question into one of three categories, and identify question type + EBM framework. -2. **Query structuring** (only when applicable): for non-Diagnosis full-pipeline questions, also produce the structured PICO/PEO/Prognosis query so the downstream pipeline can search literature directly. +Your response has two parts: (1) route + domain classification, (2) structured query when applicable. 
-Diagnosis questions and sub-question decompositions do NOT include structured query in this step — they are handled by follow-up prompts. +--- + +## Step 0 — Domain Filter + +Set `hypertension_related=true` if the question involves: hypertension, BP management, antihypertensives, hypertension complications/comorbidities (incl. CKD, diabetes, pregnancy, stroke, CAD), white coat/masked/secondary hypertension, lifestyle for BP, TCM for hypertension, or any borderline BP question. + +Set `hypertension_related=false` **only** when completely unrelated to hypertension (e.g., pure oncology, orthopedics, diabetes with no BP angle). When false, set `route_type=direct_answer`; all other fields null/default. + +Output `domain_rationale` in ≤20 words, same language as input. --- @@ -105,6 +111,8 @@ Select the most appropriate query framework: **JSON:** ```json {{ + "hypertension_related": true, + "domain_rationale": "≤20 word rationale in input language", "route_type": "direct_answer | full_pipeline | sub_questions", "route_confidence": 0.0, "question_type": "Therapy | Diagnosis | Prognosis | Harm | Prevention | Background | Mixed", diff --git a/src/config/prompts/assess_agent.txt b/src/config/prompts/assess_agent.txt index 28261e5..38845df 100644 --- a/src/config/prompts/assess_agent.txt +++ b/src/config/prompts/assess_agent.txt @@ -33,9 +33,10 @@ Is the recommendation strength consistent with the evidence quality per GRADE pr ## Dimension 3 — Reasoning Chain Are there logical gaps between the evidence cited in the rationale and the final recommendation? -- `COMPLETE`: Every recommendation point is traceable to cited evidence; no unsupported leaps. -- `WEAK`: Minor inferential gaps or one unsupported detail, but the overall reasoning is traceable. -- `BROKEN`: Recommendation contradicts the cited evidence, or key claims have no traceable evidence basis.
+Citation tags in the format `[evidence_id / section]` are the evidence trail — check that they appear in the rationale for factual claims. +- `COMPLETE`: Every recommendation point is traceable to cited evidence (with citation tags); no unsupported leaps. +- `WEAK`: Minor inferential gaps, citation tags present but sparse, or one unsupported detail. +- `BROKEN`: Recommendation contradicts the cited evidence, key claims have no citation tags, or citation IDs appear fabricated. ## Dimension 4 — Caveats Are important limitations, applicability constraints, and knowledge gaps documented? diff --git a/src/config/prompts/judge/acquire_judge.txt b/src/config/prompts/judge/acquire_judge.txt index 86ea280..356a037 100644 --- a/src/config/prompts/judge/acquire_judge.txt +++ b/src/config/prompts/judge/acquire_judge.txt @@ -10,8 +10,8 @@ Acquire Agent 输出(已排序的证据列表):{stage_output} # 预处理:系统错误检测 -首先检查输入数据中是否包含 error 字段(如 "error": "Connection timeout"): -如果存在 error 字段,说明 PubMed API 调用本身失败,与检索词无关。 +首先检查输入数据中是否包含 error 字段(如 "error": "hypertension_api_unavailable: ..."): +如果存在 error 字段,说明 hypertensiondb RAG 服务本身失败,与检索词无关。 此时跳过所有审计项,直接输出:search_terms_valid=YES,所有 rubric 填 NA,search_exhausted=false,failures=[],overall_quality=pass。 # 一票否决项(Gate) @@ -30,10 +30,10 @@ Acquire Agent 输出(已排序的证据列表):{stage_output} - ebm_prognosis:Prognostic Factor ## R1. keywords_cover_pico_dimensions【Critical,权重3】 -关键词是否覆盖 P + 主焦点维度,且至少含一个可在 MeSH 验证的标准词? -- YES:覆盖 P + 主焦点维度,且含 MeSH 标准词 -- PARTIAL:覆盖了 P 或主焦点之一,但另一维度无对应关键词;或有覆盖但无 MeSH 标准词 -- NO:关键词全部指向同一概念,未覆盖多个维度 +自然语言中文检索语句(search_query 字段)是否覆盖 P + 主焦点维度? +- YES:query 紧扣 P 和主焦点,含具体的疾病/干预/对比术语 +- PARTIAL:query 覆盖了 P 或主焦点之一,但另一维度模糊 +- NO:query 方向偏离,未紧扣 PICO 主焦点 ## R2. primary_focus_match【Critical,权重3】 基于证据列表中主焦点匹配度最好的那篇证据判断:证据中的核心干预/暴露/测试是否与查询主焦点维度匹配? @@ -54,16 +54,16 @@ Acquire Agent 输出(已排序的证据列表):{stage_output} - NO:未报告任何相关结局 ## R4. keywords_have_synonyms【Major,权重2】 -核心概念是否有同义词/变体(如 SGLT2i + empagliflozin + dapagliflozin)? 
-- YES:有同义词/变体 -- PARTIAL:有部分同义词但不完整 -- NO:无任何同义词扩展,仅有单一术语 +检索语句是否包含核心概念的同义表达或别名(如同时提到"缬沙坦"和"ARB"),或语义足够丰富以匹配多种表述? +- YES:query 自然涵盖核心概念的常见同义表达 +- PARTIAL:仅提到单一术语但语义已足够明确 +- NO:query 极短/缩略,未覆盖任一同义表达 ## R5. keywords_count_sufficient【Major,权重2】 -关键词数量是否充足? -- YES:≥ 5 个 -- PARTIAL:3-4 个 -- NO:≤ 2 个 +检索语句是否表意完整(包含人群+干预+结局至少三个语义要素)? +- YES:包含 ≥3 个语义要素 +- PARTIAL:包含 2 个语义要素 +- NO:仅 1 个语义要素或纯关键词堆砌 ## R6. study_design_matches_route【Major,权重2】 纳入文献的研究设计是否与 route_type 的优先级匹配? @@ -99,17 +99,17 @@ Acquire Agent 输出(已排序的证据列表):{stage_output} - NO:明显不合理(大量高质量候选却只选1-2篇,或质量极差仍凑满10篇) ## R10. key_sentences_present【Minor,权重1】 -Top 文章的 key_sentences 字段是否有实质内容? -- YES:Top 文章的 key_sentences 非空,RAG 流程正常执行 -- PARTIAL:部分文章 key_sentences 为空(摘要极短导致 chunk 失败) -- NO:所有文章 key_sentences 均为空,RAG 流程可能失败 +Top 文章是否带有 supporting_passages(含 section + snippet)字段? +- YES:所有 Top 文章 supporting_passages 均非空,且 snippet 与查询相关 +- PARTIAL:部分文章 supporting_passages 为空或 snippet 与查询关系较弱 +- NO:所有 Top 文章 supporting_passages 均为空,RAG 检索可能失败 -## 5. 全文与 RAG 质量审计(full_text_audit) +## 5. RAG 服务降级审计(rag_degraded_audit) -**full_text_coverage**:Top 文章(排名前3)中,has_full_text=True 的比例是否合理? -- `GOOD`:≥2/3 篇有全文 -- `PARTIAL`:1/3 篇有全文,或全文获取部分失败 -- `NONE`:Top 3 篇均无全文 +**rag_degraded**:响应中是否报告了组件降级? +- `OK`:rag_degraded 为空或 None,所有组件正常 +- `PARTIAL`:dense 或 rerank 单路降级,结果仍可用 +- `MAJOR`:两路均降级或服务不可达 # Output Format 仅输出以下 JSON,不要包含任何其他文本: @@ -133,9 +133,12 @@ Top 文章的 key_sentences 字段是否有实质内容? 
"key_sentences_present": "YES | PARTIAL | NO" }}, "full_text_audit": {{ - "full_text_coverage": "GOOD | PARTIAL | NONE", + "full_text_coverage": "NA", "key_sentences_present": "YES | PARTIAL | NO" }}, + "rag_degraded_audit": {{ + "rag_degraded": "OK | PARTIAL | MAJOR" + }}, "search_exhausted": false, "failures": ["具体失败项及原因(无失败则为空列表)"], "overall_quality": "pass | fail | gate_fail" diff --git a/src/config/prompts/judge/apply_judge.txt b/src/config/prompts/judge/apply_judge.txt index ee6e338..8eca597 100644 --- a/src/config/prompts/judge/apply_judge.txt +++ b/src/config/prompts/judge/apply_judge.txt @@ -44,13 +44,19 @@ Apply 的维度一致性检查是否使用了与 route_type 匹配的框架? 注意:inconsistency=SERIOUS 时 Moderate→Weak 的降强推荐属正确行为,不应标注为不匹配。 -EBM原则(基于 evidence_quality): -- evidence_quality=High/Moderate + 结果一致 → Strong 合理 -- evidence_quality=Low 或结果不一致 → Weak 合理 -- evidence_quality=Very Low + 结果一致 → Weak 合理(有明确 caveats) +EBM原则(基于 evidence_quality,严格遵循 GRADE Guyatt et al. 2011): +- evidence_quality=High/Moderate + 结果一致 + 效益明显大于害处 → **Strong 合理** + - 注意:indirectness(人群不完全匹配、代理结局)影响证据质量,但不单独导致强度降为 Conditional。若采纳证据为 Moderate 且方向一致,Strong 是正确的。 +- evidence_quality=Moderate + 研究间结论冲突(相反方向),或效益与害处真正不确定 → Conditional 合理 +- evidence_quality=Low + 结果一致 → **Conditional 合理**(不是 Weak) +- evidence_quality=Low + 结果不一致 → Weak 合理 +- evidence_quality=Very Low + 结果一致 → Conditional 合理(附明确不确定性说明) +- evidence_quality=Very Low + 结果不一致/冲突 → Weak 合理 - 仅间接证据 → Conditional 合理 - 仅专家共识/指南 → Consensus-based 合理 +注意:人群不完全匹配、样本量有限、随访短、indirectness — 这些写进 caveats,不构成单独降低推荐强度的理由。 + - YES:推荐强度与 `evidence_quality` 严格匹配(含上述特殊情况) - PARTIAL:有轻微偏差(如 Moderate 证据给 Strong,但结果高度一致且无 inconsistency 问题),临床上可接受 - NO:推荐强度与 `evidence_quality` 明显不符(不触发 gate 的中等程度不匹配) @@ -68,10 +74,10 @@ EBM原则(基于 evidence_quality): - NO:未提及不确定性,或仅说"证据有限"而无来源说明 ## R5. citation_traceable【Major,权重2】 -推荐依据是否有文献溯源(PMID 或标题可追溯)? 
-- YES:推荐依据有文献溯源 -- PARTIAL:部分推荐有溯源,部分缺失 -- NO:无任何文献溯源 +推荐文本是否使用 `[evidence_id / section]` 格式对关键结论进行了引用,且引用的 evidence_id 与 section 确实出现在提供的证据列表中? +- YES:每条事实性结论后均有 `[EV-... / section]` 引用,且 evidence_id + section 均可在证据列表中找到对应 passage +- PARTIAL:大部分关键结论有引用,但部分缺失;或个别 evidence_id 无法在证据列表中精确匹配 +- NO:完全未使用 `[evidence_id / section]` 引用格式;或大量引用 ID 是 Apply 编造的(不在证据列表内) ## R6. recommendation_specific【Minor,权重1】 推荐内容是否足够具体,临床医生可据此执行(含适应症、关键参数等)? diff --git a/src/config/prompts/judge/appraise_judge.txt b/src/config/prompts/judge/appraise_judge.txt index 422c54e..cb6b32e 100644 --- a/src/config/prompts/judge/appraise_judge.txt +++ b/src/config/prompts/judge/appraise_judge.txt @@ -16,18 +16,20 @@ Appraise Agent 输出(包含分类标签和计算结果):{stage_output} ## G1. study_type_correct 所有研究的 study_type 识别是否正确? -**判断优先级**:Appraise Agent 优先使用 PubMed `pub_types` 元数据(权威字段,由 PubMed 索引人员标注)。判断时应遵循以下规则: -- 若 Agent 的 study_type 与 `pub_types` 一致 → **必须判 YES**,即使摘要文字描述不够清晰 -- 若 `pub_types` 缺失,Agent 依据摘要文字推断 → 只要推断方向合理(如摘要有"randomized"字样标为RCT)即判 YES -- 判 NO 的条件:**摘要明确、直接地说明了与 pub_types 相反的设计**(如 pub_types 写 RCT,但摘要第一句写"This is an observational cohort study");或研究类型在 GRADE 合法范围之外 +**判断依据**:Appraise Agent 基于证据库提供的 supporting_passages(含 section 与 snippet)+ 文献标签(tags)判定 study_type。判断时应遵循以下规则: +- 若 supporting_passages 中明确提及"randomized"、"placebo-controlled"等关键词 → Agent 标 RCT 即判 YES +- 若 passages 描述系统检索方法 + 纳入标准 → Agent 标 SYSTEMATIC_REVIEW / META_ANALYSIS 即判 YES +- 若 passages 描述前瞻性随访人群 → Agent 标 COHORT 即判 YES +- 若 tags 中含明确研究设计标识(如 "RCT"、"systematic_review"、"guideline"),与 Agent 标签一致 → 判 YES +- 判 NO 的条件:passages 明确、直接地说明了与 Agent 标签相反的设计(如 Agent 标 RCT,但 passages 第一句写"This is an observational cohort study");或研究类型在 GRADE 合法范围之外 **常见合理情形(应判 YES)**: -- pub_types 含 `Randomized Controlled Trial` → Agent 标 RCT → YES(即使摘要提到电子病历数据) -- pub_types 含 `Meta-Analysis` → Agent 标 META_ANALYSIS → YES -- pub_types 缺失,摘要含 "cohort" → Agent 标 COHORT → YES +- passages 含 "randomized, double-blind" → Agent 标 RCT → YES +- passages 描述 PRISMA 流程 
+ 异质性检验 → Agent 标 META_ANALYSIS → YES +- passages 描述长期随访队列 → Agent 标 COHORT → YES - YES:所有研究的 study_type 识别符合上述规则 -- NO:存在明显错误,且摘要有直接相反证据(如 pub_types=RCT 但摘要明确写"观察性研究") +- NO:存在明显错误,且 passages 有直接相反证据 ## G2. computed_grade_reasonable 给定 Appraise Agent 输出的分类标签(study_type、included_study_type、各降级/升级因素),系统计算出的最终 GRADE 等级在数学逻辑上是否正确? @@ -42,7 +44,7 @@ Appraise Agent 输出(包含分类标签和计算结果):{stage_output} - YES:给定这些标签,计算出的等级符合 GRADE 数学规则 - NO:计算路径**明显**错误(如标签显示 RCT + 无任何降级因素,但输出 Very Low;或 COHORT 标了 large_effect=YES 且 risk_of_bias=NOT_SERIOUS,但等级未升级) - UNCERTAIN:满足以下任一条件时使用,**降级为 MAJOR 问题,不触发 Gate**: - - 摘要截断,无法确认 study_type 或关键降级因素 + - supporting passages截断,无法确认 study_type 或关键降级因素 - 研究设计存在歧义(如"target trial emulation"、SR内含NMA等混合设计) - 降级因素标注本身存在争议(如 risk_of_bias 的严重程度有合理分歧),导致预期等级范围不确定 @@ -57,22 +59,22 @@ Appraise Agent 输出(包含分类标签和计算结果):{stage_output} # Rubric 评分项 ## R1. downgrade_factors_appropriate【Critical,权重3】 -四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注是否与摘要信息相符? +四个降级因素(risk_of_bias/inconsistency/indirectness/imprecision)的严重程度标注是否与supporting passages信息相符? 
**GRADE标准下的合理行为(判YES或PARTIAL,不判NO):** -- 摘要截断导致无法确认效应量/CI/样本量 → 标注 `imprecision=SERIOUS` 是合理的保守判断 +- supporting passages截断导致无法确认效应量/CI/样本量 → 标注 `imprecision=SERIOUS` 是合理的保守判断 - 单篇研究标注 `inconsistency=NA` 是**唯一正确**做法(见下方强制规则) - NARRATIVE_REVIEW/EXPERT_OPINION 各因素标注 `NA` 是正确做法 - 大型双盲安慰剂对照 RCT 标注 `risk_of_bias=NOT_SERIOUS` 是正确做法(见下方豁免规则) -**risk_of_bias 豁免规则**:摘要已明确描述"double-blind, placebo-controlled"或"randomized, double-blind",或大型多中心 RCT(>1000人)且明确双盲/安慰剂对照 → 标注 `NOT_SERIOUS` 正确,不应因"未详述分配隐藏方法"而判为错误。 +**risk_of_bias 豁免规则**:supporting passages已明确描述"double-blind, placebo-controlled"或"randomized, double-blind",或大型多中心 RCT(>1000人)且明确双盲/安慰剂对照 → 标注 `NOT_SERIOUS` 正确,不应因"未详述分配隐藏方法"而判为错误。 **inconsistency 强制规则**:单篇独立研究(RCT/COHORT/CASE_CONTROL/CROSS_SECTIONAL)的 `inconsistency` **必须为 NA**。若 Agent 对单篇研究标注了 NOT_SERIOUS/SERIOUS/VERY_SERIOUS,这是错误的,应判 NO。 **indirectness 判断基准**:只评估**该研究本身**的 PICO 与目标 PICO 的匹配程度。不得将"证据列表中其他研究 PICO 不同"作为该研究 indirectness 的判断依据。 **判 NO 的条件(明显违反GRADE标准):** -- 摘要明确描述了双盲/安慰剂对照,仍标注 `risk_of_bias=SERIOUS/VERY_SERIOUS` +- supporting passages明确描述了双盲/安慰剂对照,仍标注 `risk_of_bias=SERIOUS/VERY_SERIOUS` - 单篇 RCT/COHORT 等研究的 `inconsistency` 标注为 NOT_SERIOUS/SERIOUS/VERY_SERIOUS(应为 NA) - 将证据列表中其他研究的 PICO 差异作为本研究 `indirectness` 降级的理由 - 升级因素(large_effect、dose_response)被错误地用于 RCT 或 SR/MA(应填 NA) @@ -87,16 +89,16 @@ Appraise Agent 输出(包含分类标签和计算结果):{stage_output} ## R2. included_study_type_correct【Critical,权重3】 (仅当证据列表含 SYSTEMATIC_REVIEW/META_ANALYSIS/NMA 时判断,否则填 NA) -SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类型相符? -- YES:字段与摘要描述的纳入研究类型相符(如摘要明确描述"纳入RCT"→ RCT) -- PARTIAL:摘要信息不足以确认(如摘要未描述纳入类型 → UNKNOWN 是合理选择) -- NO:明显错误(如摘要写"仅纳入RCT"但标注为 OBSERVATIONAL) +SR/MA/NMA 的 included_study_type 字段是否与supporting passages描述的纳入研究类型相符? +- YES:字段与supporting passages描述的纳入研究类型相符(如supporting passages明确描述"纳入RCT"→ RCT) +- PARTIAL:supporting passages信息不足以确认(如supporting passages未描述纳入类型 → UNKNOWN 是合理选择) +- NO:明显错误(如supporting passages写"仅纳入RCT"但标注为 OBSERVATIONAL) - NA:证据列表中没有 SR/MA/NMA 类型研究 ## R3. 
upgrade_factors_appropriate【Major,权重2】 (仅当证据列表含 COHORT/CASE_CONTROL 时判断,否则填 NA) -升级因素(large_effect/dose_response/confounding_bias_mitigates)的标注是否与摘要信息相符? -- YES:升级因素的 YES/NO 标注与摘要信息相符 +升级因素(large_effect/dose_response/confounding_bias_mitigates)的标注是否与supporting passages信息相符? +- YES:升级因素的 YES/NO 标注与supporting passages信息相符 - PARTIAL:整体合理,个别因素有轻微偏差 - NO:明显错误(如无明确剂量效应数据但标注 dose_response=YES) - NA:证据列表中没有 COHORT/CASE_CONTROL 研究 @@ -115,10 +117,10 @@ SR/MA/NMA 的 included_study_type 字段是否与摘要描述的纳入研究类 - NO:存在明显冲突但完全未识别 ## R6. numerical_data_extracted【Minor,权重1】 -摘要中存在效应量/CI/P值时,是否均被提取? -- YES:data_available 的判断准确,能识别摘要中存在的数值指标 +supporting passages中存在效应量/CI/P值时,是否均被提取? +- YES:data_available 的判断准确,能识别supporting passages中存在的数值指标 - PARTIAL:判断基本合理,有轻微偏差 -- NO:摘要有明确效应量但标记为未提取 +- NO:supporting passages有明确效应量但标记为未提取 # Output Format 仅输出以下 JSON,不要包含任何其他文本: diff --git a/src/config/prompts/judge/ask_judge.txt b/src/config/prompts/judge/ask_judge.txt index 38a5484..1f2576c 100644 --- a/src/config/prompts/judge/ask_judge.txt +++ b/src/config/prompts/judge/ask_judge.txt @@ -23,12 +23,13 @@ route_type 与问题类型是否匹配? ## G3. nonresearch_classification_correct(仅当 route_type = direct_answer 时判断,否则填 NA) 判断 direct_answer 路由是否合适——即问题是否确实不需要文献检索来回答。 满足以下任一情形即视为合适: -1. **事实/定义性问题**:答案来自已建立的医学知识(如"LVEF 代表什么"、"血清钠正常范围"、"GRADE 等级有哪些") -2. **药物剂量 / 药代动力学 / 已确立的临床阈值**:查询类问题,答案为公认数值 -3. **紧急操作性指导**:要求立即操作(如何处理/立即给/紧急处置),延迟回答直接危及生命,答案来自公认标准流程(BLS/ACLS/指南操作章节) -4. **明确超出临床 EBM 范畴**:管理、计费、非临床问题 -- YES:问题属于以上任一类型,文献检索不会增加额外价值 -- NO:问题实际上需要循证评估(治疗有效性、诊断准确性、预后、危害、预防等),应重路由到 EBM 流程 +1. **领域外软拒绝**:Ask Agent 输出中 `out_of_domain=true`,且问题确实与高血压无关(如纯糖尿病、肿瘤、骨科等) +2. **事实/定义性问题**:答案来自已建立的医学知识(如"LVEF 代表什么"、"血清钠正常范围"、"GRADE 等级有哪些") +3. **药物剂量 / 药代动力学 / 已确立的临床阈值**:查询类问题,答案为公认数值 +4. **紧急操作性指导**:要求立即操作(如何处理/立即给/紧急处置),延迟回答直接危及生命,答案来自公认标准流程(BLS/ACLS/指南操作章节) +5. 
**明确超出临床 EBM 范畴**:管理、计费、非临床问题 +- YES:问题属于以上任一类型,文献检索不会增加额外价值;或 out_of_domain 软拒绝判定正确 +- NO:问题实际上需要循证评估(治疗有效性、诊断准确性、预后、危害、预防等),应重路由到 EBM 流程;或 out_of_domain 误判(高血压问题被错误软拒绝) - NA:route_type != direct_answer,不适用 # Rubric 评分项(仅适用于 EBM 路由;direct_answer 路由时所有 rubric 填 NA) diff --git a/src/coordinator/coordinator.py b/src/coordinator/coordinator.py index 729d2a7..bdbf499 100644 --- a/src/coordinator/coordinator.py +++ b/src/coordinator/coordinator.py @@ -75,7 +75,7 @@ def initialize_state(self, question: str) -> WorkflowState: sub_question_total=None, ) - def execute_agent(self, agent_name: str, state: WorkflowState) -> WorkflowState: + def execute_agent(self, agent_name: str, state: WorkflowState, on_agent_complete=None) -> WorkflowState: """Execute a single agent and update state""" agent = self.agents[agent_name] @@ -96,6 +96,13 @@ def execute_agent(self, agent_name: str, state: WorkflowState) -> WorkflowState: for key, value in result.items(): state[key] = value + # Notify caller immediately after agent output is ready (before Judge) + if on_agent_complete is not None: + try: + on_agent_complete(agent_name, state) + except Exception: + pass + # Generate observe using Judge LLM t0 = time.time() observe = self.judge_llm.evaluate_stage(agent_name, result, state) @@ -230,7 +237,7 @@ def handle_scheduling_decision( return state - def execute_workflow(self, question: str) -> WorkflowState: + def execute_workflow(self, question: str, on_stage_complete=None) -> WorkflowState: """Execute complete workflow with Judge and Scheduling LLMs""" state = self.initialize_state(question) workflow_start = time.time() @@ -241,8 +248,10 @@ def execute_workflow(self, question: str) -> WorkflowState: if current_step is None: break - # Execute current agent (includes Judge timing inside execute_agent) - state = self.execute_agent(current_step, state) + # Execute current agent — on_stage_complete fires inside execute_agent + # immediately after agent output is ready (before Judge), so the user 
+ # sees Ask results ~12s earlier than waiting for Judge to finish. + state = self.execute_agent(current_step, state, on_agent_complete=on_stage_complete) # ── Direct-answer early exit ──────────────────────────────────────── # If the Ask agent decided the question can be answered directly from diff --git a/src/judge/judge_llm.py b/src/judge/judge_llm.py index d1df7ae..693422a 100644 --- a/src/judge/judge_llm.py +++ b/src/judge/judge_llm.py @@ -175,17 +175,17 @@ def _appraise_layer1_check(output: Dict) -> Dict: study_type = ev.get("study_type") if not study_type or study_type not in LEGAL_STUDY_TYPES: failures.append( - f"study_type missing or illegal: pmid={ev.get('pmid', '?')} study_type={study_type}" + f"study_type missing or illegal: evidence_id={ev.get('evidence_id', '?')} study_type={study_type}" ) rob = ev.get("risk_of_bias") if rob is None: - failures.append(f"risk_of_bias missing: pmid={ev.get('pmid', '?')}") + failures.append(f"risk_of_bias missing: evidence_id={ev.get('evidence_id', '?')}") grade = ev.get("grade_level") if grade and grade not in LEGAL_GRADES: raise SystemError( - f"grade_output_in_legal_range FAILED: pmid={ev.get('pmid', '?')} grade={grade}. " + f"grade_output_in_legal_range FAILED: evidence_id={ev.get('evidence_id', '?')} grade={grade}. " "Illegal grade value — workflow terminated." ) @@ -883,19 +883,23 @@ def _prepare_context( evidence_list = raw_output.get("evidence_list", []) condensed_evidence = [] for i, e in enumerate(evidence_list): + passages = getattr(e, "supporting_passages", None) or [] condensed_evidence.append( { "id": i + 1, "title": e.title if hasattr(e, "title") else str(e), "source": getattr(e, "source", ""), - "pmid": getattr(e, "pmid", ""), + "evidence_id": getattr(e, "evidence_id", None), "study_type": getattr(e, "study_type", "Unknown"), "relevance_score": getattr(e, "relevance_score", 0.0), - # has_full_text and key_sentences let the Judge verify - # key_sentences_present without guessing from other fields. 
- "has_full_text": getattr(e, "has_full_text", False), - "has_key_sentences": bool(getattr(e, "key_sentences", None)), - "abstract_preview": (getattr(e, "abstract", "") or "")[:200], + "tags": getattr(e, "tags", None) or [], + "language": getattr(e, "language", ""), + "year": getattr(e, "year", None), + "passage_count": len(passages), + "passages_preview": [ + {"section": p.section, "snippet": (p.snippet or "")[:200]} + for p in passages[:2] + ], } ) # Truncate search_query: full Boolean PubMed queries are 500-1000+ chars, @@ -925,13 +929,15 @@ def _prepare_context( { "title": e.title, "source": e.source, - "pmid": e.pmid, + "evidence_id": e.evidence_id, "relevance_score": e.relevance_score, - # pub_types is the authoritative study design field used by AppraiseAgent. - # Including it here lets the Judge verify study_type using the same - # source of truth, eliminating abstract-text vs metadata divergence. - "pub_types": getattr(e, "pub_types", None) or [], - "abstract": (getattr(e, "abstract", "") or "")[:300], + "tags": e.tags or [], + "language": e.language, + "year": e.year, + "supporting_passages": [ + {"section": p.section, "snippet": (p.snippet or "")[:300], "score": p.score} + for p in (e.supporting_passages or []) + ], } for e in evidence_list ], diff --git a/src/main.py b/src/main.py index a98f9f9..42f7db1 100644 --- a/src/main.py +++ b/src/main.py @@ -34,10 +34,13 @@ def _warmup_llms() -> None: - """Fire one minimal request to each LLM purpose in parallel so HTTP - connections and the upstream model are pre-warmed before the pipeline's - first real call. Failures are swallowed — warmup must never block the run.""" - t0 = time.time() + """Fire warmup pings in background — do NOT block the pipeline. + + Warmup pre-heats HTTP connections and the upstream model so that + Acquire/Appraise/Apply calls (which come after Ask) benefit from warm + connections. 
Ask itself is already the first real call, so waiting for + warmup to finish before Ask only adds latency without benefit. + """ clients = [ ("agent", get_llm(temperature=0.0, purpose="agent")), ("judge", get_fast_llm(temperature=0.0, purpose="judge")), @@ -46,14 +49,18 @@ def _warmup_llms() -> None: def _ping(name_client): name, client = name_client + t0 = time.time() try: client.invoke("ok") + # Use \r-overwrite so warmup lines don't interrupt streaming output + # from Ask. They're debug-only; only printed if nothing else is printing. except Exception as exc: - print(f"[WARMUP] {name} failed (non-fatal): {exc}") + import sys as _sys + print(f"[WARMUP] {name} failed (non-fatal): {exc}", file=_sys.stderr) - with ThreadPoolExecutor(max_workers=len(clients)) as pool: - list(pool.map(_ping, clients)) - print(f"[TIMING] warmup: {time.time() - t0:.2f}s") + executor = ThreadPoolExecutor(max_workers=len(clients)) + executor.map(_ping, clients) + # Intentionally NOT calling executor.shutdown(wait=True) — we want fire-and-forget. 
def create_workflow() -> Coordinator: @@ -91,6 +98,76 @@ def create_workflow() -> Coordinator: return coordinator +def _print_stage_result(stage: str, state: Dict[str, Any]) -> None: + """Print a stage's result immediately after it completes.""" + sep = "─" * 60 + if stage == "Ask": + route = state.get("route_type", "") + if route == "direct_answer": + direct = state.get("direct_answer_output") or {} + out_of_domain = state.get("out_of_domain", False) + if out_of_domain: + print(f"\n{sep}") + print(f"[Ask] 问题超出高血压领域") + print(f" {direct.get('answer', '')}") + print(sep) + else: + print(f"\n{sep}") + print(f"[Ask] 直接回答") + print(f" {direct.get('answer', '')}") + print(sep) + else: + pico = state.get("pico_query") + ebm = state.get("ebm_query") + print(f"\n{sep}") + print(f"[Ask] 问题解析完成 (route={route}, type={state.get('question_type','')})") + if ebm: + print(f" Patient : {ebm.patient}") + print(f" Focus : {ebm.primary_focus}") + if ebm.comparator: + print(f" Comparator : {ebm.comparator}") + print(f" Outcome : {ebm.outcome}") + elif pico: + print(f" P : {pico.patient}") + print(f" I : {pico.intervention}") + print(f" C : {pico.comparison}") + print(f" O : {pico.outcome}") + print(sep) + + elif stage == "Acquire": + ev_list = state.get("evidence_list") or [] + query = state.get("search_query", "") + print(f"\n{sep}") + print(f"[Acquire] 检索到 {len(ev_list)} 篇文献 query: {query[:80]}") + for i, e in enumerate(ev_list, 1): + print(f" {i}. 
[{e.study_type or '?'}] {(e.title or '')[:70]}") + print(f" ID={e.evidence_id or '?'} score={e.relevance_score:.3f} passages={len(e.supporting_passages)}") + print(sep) + + elif stage == "Appraise": + ev_list = (state.get("appraisal_results") and state["appraisal_results"].evidence) or [] + print(f"\n{sep}") + print(f"[Appraise] 证据质量评价完成 ({len(ev_list)} 篇)") + for e in ev_list: + print(f" {e.evidence_id or e.title[:40]:45s} {e.study_type or '?':22s} GRADE={e.grade_level or '?'}") + print(sep) + + elif stage == "Apply": + rec = state.get("recommendation") + if rec: + print(f"\n{sep}") + print(f"[Apply] 推荐生成完成 (strength={rec.strength}, quality={rec.evidence_quality})") + print(f" {rec.text[:200]}") + print(sep) + + elif stage == "Assess": + assess = state.get("assessment") + if assess: + print(f"\n{sep}") + print(f"[Assess] 质量评估完成 score={assess.quality_score:.2f} backtrack={assess.needs_backtrack}") + print(sep) + + def run_clinical_question(question: str) -> Dict[str, Any]: """ Run a clinical question through the complete 5A workflow @@ -102,7 +179,7 @@ def run_clinical_question(question: str) -> Dict[str, Any]: Final workflow state with recommendation """ coordinator = create_workflow() - result = coordinator.execute_workflow(question) + result = coordinator.execute_workflow(question, on_stage_complete=_print_stage_result) return result @@ -142,7 +219,7 @@ def format_output(state: Dict[str, Any]) -> str: output.append(f"EVIDENCE FOUND: {len(state['evidence_list'])} articles") for i, evidence in enumerate(state["evidence_list"][:3], 1): output.append(f" {i}. 
{evidence.title}") - output.append(f" Source: {evidence.source} (PMID: {evidence.pmid})") + output.append(f" Source: {evidence.source} (ID: {evidence.evidence_id or '?'})") if evidence.grade_level: output.append(f" Quality: {evidence.grade_level}") output.append("") diff --git a/src/state/schema.py b/src/state/schema.py index 1f55e36..be9d8dd 100644 --- a/src/state/schema.py +++ b/src/state/schema.py @@ -1,5 +1,5 @@ from typing import TypedDict, Optional, List, Dict, Any -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime @@ -28,23 +28,47 @@ class EBMQuery: time_horizon: Optional[str] = None # relevant for prognosis questions +@dataclass +class Passage: + """A supporting passage retrieved from a paper. + + Sourced from hypertensiondb /search chunk-level results. Multiple + passages from the same paper are grouped under one Evidence object. + """ + + section: str # e.g. "结果/主要结局" + snippet: str # <= 800 chars + score: float # rerank_score from /search + + @dataclass class Evidence: - """Single piece of evidence""" + """A paper of evidence with its supporting passages. + + Acquire fills evidence_id / title / year / language / tags / + supporting_passages / relevance_score from the hypertensiondb /search + response. + + Appraise fills study_type / grade_level / rob_overall after appraising + the paper. + """ title: str - source: str - pmid: Optional[str] - abstract: str - relevance_score: float + source: str # "hypertensiondb" + relevance_score: float # = max(passage.score) within paper + + # Filled by Acquire from hypertensiondb /search response: + evidence_id: Optional[str] = None # e.g. 
EV-RCT-2026-PENG-001 + supporting_passages: List[Passage] = field(default_factory=list) + language: str = "" # zh | en | bilingual + tags: List[str] = field(default_factory=list) + year: Optional[int] = None + + # Filled by Appraise: study_type: Optional[str] = None - publication_date: Optional[str] = None - grade_level: Optional[str] = None - pmcid: Optional[str] = None # PMC article ID (local DB only) - full_text: Optional[str] = None # Full text (local DB only, not passed to prompts) - key_sentences: Optional[str] = None # Extracted span(s) relevant to query keywords - has_full_text: bool = False # True when full_text field is populated - pub_types: Optional[List[str]] = None # PubMed publication types (e.g. ["Randomized Controlled Trial"]) + grade_level: Optional[str] = None # very_low | low | moderate | high + rob_overall: Optional[str] = None # low | some_concerns | high + publication_date: Optional[str] = None # legacy; safe to leave None @dataclass @@ -179,10 +203,13 @@ class WorkflowState(TypedDict): remaining_budget: int soft_gate_signals: List[str] question_type: Optional[str] - route_type: Optional[str] # "direct_answer" | "full_pipeline" | "sub_questions" - route_confidence: Optional[float] # 0.0-1.0 confidence in routing decision - direct_answer_output: Optional[Dict[str, Any]] # populated when route_type == "direct_answer" - ebm_query: Optional[EBMQuery] # structured query replacing/extending pico_query - sub_pico_queries: Optional[List[EBMQuery]] # decomposed sub-questions - sub_question_index: Optional[int] # current sub-question being processed (0-based) - sub_question_total: Optional[int] # total number of sub-questions + route_type: Optional[str] + route_confidence: Optional[float] + direct_answer_output: Optional[Dict[str, Any]] + ebm_query: Optional[EBMQuery] + sub_pico_queries: Optional[List[EBMQuery]] + sub_question_index: Optional[int] + sub_question_total: Optional[int] + # NEW: hypertension RAG refactor + out_of_domain: Optional[bool] # 
True when Ask soft-rejected non-hypertension question + rag_degraded: Optional[List[str]] # degradation tags from /search response diff --git a/src/tools/hypertension_rag_client.py b/src/tools/hypertension_rag_client.py new file mode 100644 index 0000000..333caaf --- /dev/null +++ b/src/tools/hypertension_rag_client.py @@ -0,0 +1,148 @@ +"""HTTP client for the hypertensiondb FastAPI /search endpoint. + +Responsibilities: +- Issue GET /search with query + top_k +- Retry on transient failures (2 retries with exponential backoff) +- Aggregate chunk-level results into paper-level Evidence objects with + supporting_passages +- Raise RAGUnavailable on persistent failure +""" +from __future__ import annotations + +import os +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any + +import httpx + +from src.state.schema import Evidence, Passage + + +class RAGUnavailable(Exception): + """Raised when hypertensiondb /search is unreachable after retries.""" + + +@dataclass(frozen=True) +class RAGConfig: + base_url: str + timeout_s: float + top_k: int + max_papers: int + max_passages_per_paper: int + + @classmethod + def from_env(cls) -> "RAGConfig": + return cls( + base_url=os.getenv("HYPERTENSION_API_URL", "http://localhost:8000"), + timeout_s=float(os.getenv("HYPERTENSION_API_TIMEOUT", "10")), + top_k=int(os.getenv("RAG_SEARCH_TOP_K", "15")), + max_papers=int(os.getenv("RAG_MAX_PAPERS", "6")), + max_passages_per_paper=int(os.getenv("RAG_MAX_PASSAGES_PER_PAPER", "3")), + ) + + +def search(query: str, config: RAGConfig | None = None) -> tuple[list[Evidence], list[str]]: + """Search hypertensiondb and return (evidence_list, degraded_flags). 
+ + Args: + query: natural-language Chinese (or English) query string + config: RAGConfig; defaults to RAGConfig.from_env() + + Returns: + evidence_list: up to config.max_papers Evidence objects, each with up to + config.max_passages_per_paper supporting_passages, sorted by max + passage score descending. + degraded_flags: list of degradation tags from the /search response + (e.g. ["dense"], ["rerank"]). Empty list when fully healthy. + + Raises: + RAGUnavailable: after 2 retries + initial attempt all fail. + """ + cfg = config or RAGConfig.from_env() + payload = _request_with_retries(query, cfg) + raw_results: list[dict] = payload.get("results") or [] + degraded: list[str] = payload.get("degraded") or [] + evidence_list = _aggregate(raw_results, cfg) + return evidence_list, degraded + + +def _request_with_retries(query: str, cfg: RAGConfig) -> dict[str, Any]: + url = f"{cfg.base_url.rstrip('/')}/search" + params = {"q": query, "top_k": cfg.top_k} + backoffs = [0.5, 2.0] # 2 retries; total max wait ~2.5s on top of timeouts + + last_exc: Exception | None = None + for attempt in range(len(backoffs) + 1): # 1 initial + 2 retries = 3 attempts + try: + resp = httpx.get(url, params=params, timeout=cfg.timeout_s) + if resp.status_code >= 500: + raise httpx.HTTPStatusError( + f"HTTP {resp.status_code}", request=resp.request, response=resp + ) + resp.raise_for_status() # 4xx -> raises immediately, will NOT retry + return resp.json() + except (httpx.TimeoutException, httpx.ConnectError, httpx.HTTPStatusError) as exc: + if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code < 500: + raise RAGUnavailable(f"hypertensiondb 4xx: {exc}") from exc + last_exc = exc + if attempt < len(backoffs): + time.sleep(backoffs[attempt]) + raise RAGUnavailable(f"hypertensiondb unreachable after retries: {last_exc}") + + +def _aggregate(raw_results: list[dict], cfg: RAGConfig) -> list[Evidence]: + """Aggregate chunk-level /search results into paper-level Evidence objects. 
+ + - Groups chunks by evidence_id. + - Within each group: sorts passages by rerank_score desc, keeps top N. + - Group order: max(rerank_score) within group, desc. + - Truncates to cfg.max_papers. + """ + by_paper: dict[str, list[dict]] = defaultdict(list) + paper_meta: dict[str, dict] = {} + for item in raw_results: + ev_id = item.get("evidence_id") + if not ev_id: + continue + by_paper[ev_id].append(item) + if ev_id not in paper_meta: + paper_meta[ev_id] = item.get("evidence_meta") or {} + + papers: list[Evidence] = [] + for ev_id, chunks in by_paper.items(): + chunks.sort(key=lambda c: c.get("rerank_score", 0.0), reverse=True) + top_chunks = chunks[: cfg.max_passages_per_paper] + passages = [ + Passage( + section=c.get("section") or "", + snippet=c.get("snippet") or "", + score=float(c.get("rerank_score", 0.0)), + ) + for c in top_chunks + ] + meta = paper_meta[ev_id] + title_field = meta.get("title") or {} + title = title_field.get("zh") or title_field.get("en") or "" + max_score = passages[0].score if passages else 0.0 + papers.append( + Evidence( + evidence_id=ev_id, + title=title, + source="hypertensiondb", + year=meta.get("year"), + language=meta.get("language") or "", + tags=list(meta.get("tags") or []), + supporting_passages=passages, + relevance_score=max_score, + # Pre-filled from frontmatter metadata when available; + # None means Appraise will infer from passage text instead. + study_type=meta.get("type") or None, + grade_level=meta.get("grade_level") or None, + rob_overall=meta.get("rob_overall") or None, + ) + ) + + papers.sort(key=lambda e: e.relevance_score, reverse=True) + return papers[: cfg.max_papers] diff --git a/src/tools/local_evidence_db.py b/src/tools/local_evidence_db.py deleted file mode 100644 index cf2aa22..0000000 --- a/src/tools/local_evidence_db.py +++ /dev/null @@ -1,232 +0,0 @@ -"""Local obstetrics evidence database: BM25 + vector search with RRF fusion. 
- -Usage (from AcquireAgent): - from src.tools.local_evidence_db import search_local - results = search_local(query="preeclampsia treatment magnesium", top_k=20) - -The database must be built first: - python scripts/build_obstetrics_db.py -""" - -import json -import pickle -import re -from pathlib import Path -from typing import List, Optional, Tuple - -from src.state.schema import Evidence - -_DB_DIR = Path(__file__).parent.parent.parent / "data" / "obstetrics_db" -_CHROMA_DIR = Path(__file__).parent.parent.parent / "data" / "obstetrics_chroma" -_BM25_PATH = _DB_DIR / "bm25.pkl" -_ARTICLES_PATH = _DB_DIR / "articles.json" - -_RRF_K = 60 -_CANDIDATE_N = 20 # candidates per retrieval path before RRF - -# Module-level lazy caches to avoid reloading on every call -_bm25_cache = None -_corpus_ids_cache = None -_articles_cache = None -_embed_model_cache = None -_chroma_collection_cache = None - - -def _load_bm25(): - global _bm25_cache, _corpus_ids_cache - if _bm25_cache is None: - with open(_BM25_PATH, "rb") as f: - data = pickle.load(f) - _bm25_cache = data["bm25"] - _corpus_ids_cache = data["corpus_ids"] - return _bm25_cache, _corpus_ids_cache - - -def _load_articles() -> dict: - global _articles_cache - if _articles_cache is None: - with open(_ARTICLES_PATH, "r", encoding="utf-8") as f: - _articles_cache = {a["pmcid"]: a for a in json.load(f)} - return _articles_cache - - -def _load_embed_model(): - global _embed_model_cache - if _embed_model_cache is None: - from sentence_transformers import SentenceTransformer - - _embed_model_cache = SentenceTransformer( - "sentence-transformers/all-MiniLM-L6-v2" - ) - return _embed_model_cache - - -def _load_chroma(): - global _chroma_collection_cache - if _chroma_collection_cache is None: - import chromadb - - client = chromadb.PersistentClient(path=str(_CHROMA_DIR)) - _chroma_collection_cache = client.get_collection("obstetrics_evidence") - return _chroma_collection_cache - - -def _rrf_fuse( - bm25_hits: List[Tuple[str, int]], 
- vector_hits: List[Tuple[str, int]], - k: int = _RRF_K, -) -> List[Tuple[str, float]]: - """Reciprocal Rank Fusion: rrf_score(d) = Σ 1/(k + rank_i(d))""" - scores: dict = {} - for pmcid, rank in bm25_hits: - scores[pmcid] = scores.get(pmcid, 0.0) + 1.0 / (k + rank) - for pmcid, rank in vector_hits: - scores[pmcid] = scores.get(pmcid, 0.0) + 1.0 / (k + rank) - return sorted(scores.items(), key=lambda x: x[1], reverse=True) - - -def _extract_spans(abstract_text: str, query_keywords: List[str]) -> Optional[str]: - """Extract key sentence spans from an abstract that match query keywords. - - Algorithm: - 1. Split abstract into sentences on . ! ? 。 boundaries. - 2. Score each sentence by count of query_keywords it contains (case-insensitive). - 3. threshold = 1 (at least one keyword match required). - 4. Merge adjacent sentences that both score >= threshold into a single span. - 5. If >= 60% of sentences score >= threshold, return the full abstract as one span. - 6. Return top-3 spans ranked by max sentence score, each capped at 200 chars. - 7. If no sentence scores >= threshold, return None. 
- """ - if not abstract_text or not query_keywords: - return None - - keywords_lower = [kw.lower() for kw in query_keywords if kw] - sentences = [s.strip() for s in re.split(r"[.!?。]+", abstract_text) if s.strip()] - if not sentences: - return None - - # Score each sentence - scores = [] - for sent in sentences: - sent_lower = sent.lower() - score = sum(1 for kw in keywords_lower if kw in sent_lower) - scores.append(score) - - threshold = 1 - above = sum(1 for s in scores if s >= threshold) - - # If >=60% of sentences match, return full abstract - if above / len(sentences) >= 0.6: - return abstract_text[:400] - - # Merge adjacent sentences that both score >= threshold - spans = [] - i = 0 - while i < len(sentences): - if scores[i] >= threshold: - span_sents = [sentences[i]] - span_max = scores[i] - j = i + 1 - while j < len(sentences) and scores[j] >= threshold: - span_sents.append(sentences[j]) - span_max = max(span_max, scores[j]) - j += 1 - spans.append((" ".join(span_sents), span_max)) - i = j - else: - i += 1 - - if not spans: - return None - - # Sort by max sentence score descending, take top 3, cap each at 200 chars - spans.sort(key=lambda x: x[1], reverse=True) - top = [s[:200] for s, _ in spans[:3]] - return " … ".join(top) - - -def search_local(query: str, top_k: int = 20) -> List[Evidence]: - """Search the local obstetrics evidence database using BM25 + vector RRF. - - Args: - query: Natural language or Boolean search string. - top_k: Maximum number of Evidence objects to return. - - Returns: - List[Evidence] compatible with the existing Acquire pipeline. - Relevance scores are rank-normalized (rank 1 → 1.0, rank top_k → ~0.1). - - Raises: - FileNotFoundError: If the local DB has not been built yet. - """ - if not _BM25_PATH.exists() or not _ARTICLES_PATH.exists(): - raise FileNotFoundError( - "Local obstetrics DB not found. 
" - "Run: python scripts/build_obstetrics_db.py" - ) - - bm25, corpus_ids = _load_bm25() - articles = _load_articles() - - # BM25 retrieval - tokens = query.lower().split() - bm25_scores = bm25.get_scores(tokens) - bm25_ranked = sorted( - range(len(corpus_ids)), key=lambda i: bm25_scores[i], reverse=True - )[:_CANDIDATE_N] - bm25_hits = [(corpus_ids[i], rank + 1) for rank, i in enumerate(bm25_ranked)] - - # Vector retrieval - vector_hits: List[Tuple[str, int]] = [] - try: - model = _load_embed_model() - collection = _load_chroma() - query_embedding = model.encode([query])[0].tolist() - n_results = min(_CANDIDATE_N, collection.count()) - if n_results > 0: - vector_results = collection.query( - query_embeddings=[query_embedding], - n_results=n_results, - include=["metadatas"], - ) - # Aggregate chunk-level to article-level (keep best rank per article) - pmcid_best_rank: dict = {} - for rank, meta in enumerate(vector_results["metadatas"][0]): - pmcid = meta["pmcid"] - if pmcid not in pmcid_best_rank: - pmcid_best_rank[pmcid] = rank + 1 - vector_hits = list(pmcid_best_rank.items()) - except Exception as e: - print(f"[DEBUG] Vector search failed ({e}), using BM25 only") - - # RRF fusion - fused = _rrf_fuse(bm25_hits, vector_hits) - - # Build Evidence list with rank-normalized relevance scores - results: List[Evidence] = [] - n = min(top_k, len(fused)) - for rank, (pmcid, _score) in enumerate(fused[:top_k]): - a = articles.get(pmcid) - if a is None: - continue - # Linear rank-normalized score: rank 0 → 1.0, rank n-1 → 0.1 - relevance = round(1.0 - (rank / max(n, 1)) * 0.9, 3) if n > 1 else 1.0 - results.append( - Evidence( - title=a.get("title", ""), - source=a.get("journal", "PMC"), - pmid=a.get("pmid"), - abstract=a.get("abstract", ""), - relevance_score=relevance, - study_type=None, # inferred later by AcquireAgent._infer_study_type() - publication_date=a.get("publication_date"), - grade_level=None, - pmcid=pmcid, - full_text=a.get( - "full_text" - ), # used only at 
retrieval stage; not in prompts - key_sentences=_extract_spans(a.get("abstract", ""), tokens), - ) - ) - - return results diff --git a/src/tools/pubmed_api.py b/src/tools/pubmed_api.py deleted file mode 100644 index 7bd24f8..0000000 --- a/src/tools/pubmed_api.py +++ /dev/null @@ -1,298 +0,0 @@ -import hashlib -import json -import os -import requests -import time -from concurrent.futures import ThreadPoolExecutor -from dataclasses import asdict -from pathlib import Path -from typing import List, Optional -from dotenv import load_dotenv -from src.state.schema import Evidence - -load_dotenv() - -# --------------------------------------------------------------------------- -# Disk-based PubMed results cache -# --------------------------------------------------------------------------- -_CACHE_DIR = Path(__file__).parent.parent.parent / "data" / "cache" -_CACHE_TTL_SECONDS = 86400 # 24 hours - - -def _cache_key(query: str, max_results: int) -> str: - """Stable cache key from query + max_results.""" - return hashlib.sha256(f"{query}||{max_results}".encode()).hexdigest()[:16] - - -def _load_cache(key: str) -> Optional[List[Evidence]]: - """Return cached Evidence list if present and not expired, else None.""" - path = _CACHE_DIR / f"pubmed_{key}.json" - if not path.exists(): - return None - if time.time() - path.stat().st_mtime > _CACHE_TTL_SECONDS: - path.unlink(missing_ok=True) - return None - try: - data = json.loads(path.read_text(encoding="utf-8")) - return [Evidence(**item) for item in data] - except Exception: - return None - - -def _save_cache(key: str, evidence_list: List[Evidence]) -> None: - """Persist Evidence list to disk cache. 
Write failure is non-fatal.""" - _CACHE_DIR.mkdir(parents=True, exist_ok=True) - path = _CACHE_DIR / f"pubmed_{key}.json" - try: - path.write_text( - json.dumps([asdict(e) for e in evidence_list], ensure_ascii=False), - encoding="utf-8", - ) - except Exception: - pass - - -class PubMedClient: - """Client for PubMed E-utilities API""" - - def __init__(self, email: str = None): - self.email = email or os.getenv("PUBMED_EMAIL", "") - self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" - - def search(self, query: str, max_results: int = 5) -> List[str]: - """Search PubMed and return list of PMIDs""" - url = f"{self.base_url}/esearch.fcgi" - params = { - "db": "pubmed", - "term": query, - "retmax": max_results, - "retmode": "json", - "sort": "relevance", - "email": self.email, - } - - response = requests.get(url, params=params) - response.raise_for_status() - data = response.json() - return data.get("esearchresult", {}).get("idlist", []) - - def fetch_abstracts(self, pmids: List[str]) -> dict: - """Fetch article abstracts for given PMIDs""" - if not pmids: - return {} - - url = f"{self.base_url}/efetch.fcgi" - params = { - "db": "pubmed", - "id": ",".join(pmids), - "retmode": "xml", - "rettype": "abstract", - "email": self.email, - } - - response = requests.get(url, params=params) - response.raise_for_status() - - import xml.etree.ElementTree as ET - - root = ET.fromstring(response.content) - - abstracts = {} - for article in root.findall(".//PubmedArticle"): - pmid_elem = article.find(".//PMID") - abstract_elem = article.find(".//Abstract/AbstractText") - - if pmid_elem is not None: - pmid = pmid_elem.text - abstract = abstract_elem.text if abstract_elem is not None else "" - abstracts[pmid] = abstract - - return abstracts - - def fetch_summaries(self, pmids: List[str]) -> dict: - """Fetch article summaries for given PMIDs""" - if not pmids: - return {} - - url = f"{self.base_url}/esummary.fcgi" - params = { - "db": "pubmed", - "id": ",".join(pmids), - 
"retmode": "json", - "email": self.email, - } - - response = requests.get(url, params=params) - response.raise_for_status() - return response.json() - - def fetch_pmc_ids(self, pmids: List[str]) -> dict: - """Convert PubMed IDs to PMC IDs via elink. - - Returns a dict mapping pmid -> "PMC<id>" for articles that have a PMC - record. PMIDs with no PMC entry are omitted. Failures return {}. - """ - if not pmids: - return {} - - import xml.etree.ElementTree as ET - - url = f"{self.base_url}/elink.fcgi" - params = { - "dbfrom": "pubmed", - "db": "pmc", - "id": ",".join(pmids), - "retmode": "xml", - "email": self.email, - } - try: - response = requests.get(url, params=params, timeout=15) - response.raise_for_status() - root = ET.fromstring(response.content) - except Exception: - return {} - - pmid_to_pmcid: dict = {} - for link_set in root.findall(".//LinkSet"): - pmid_elem = link_set.find(".//IdList/Id") - if pmid_elem is None: - continue - pmid = pmid_elem.text - for link_set_db in link_set.findall(".//LinkSetDb"): - if link_set_db.findtext("DbTo", "") != "pmc": - continue - pmc_id_elem = link_set_db.find(".//Link/Id") - if pmc_id_elem is not None: - pmid_to_pmcid[pmid] = f"PMC{pmc_id_elem.text}" - break # take the first PMC link only - return pmid_to_pmcid - - def fetch_pmc_full_text(self, pmcid: str) -> Optional[str]: - """Fetch full article text from PubMed Central. - - Args: - pmcid: PMC ID string, e.g. "PMC1234567" or bare "1234567". - - Returns: - Extracted plain-text body joined by double newlines, or None if - the article is not available in PMC open-access XML. 
- """ - import xml.etree.ElementTree as ET - - # efetch wants the numeric ID only — strip any "PMC" prefix - numeric_id = pmcid.lstrip("PMCpmc") - - url = f"{self.base_url}/efetch.fcgi" - params = { - "db": "pmc", - "id": numeric_id, - "retmode": "xml", - "email": self.email, - } - try: - response = requests.get(url, params=params, timeout=30) - response.raise_for_status() - root = ET.fromstring(response.content) - except Exception: - return None - - # PMC XML: <body> → <sec> → <p>; collect all <p> text nodes - paragraphs: List[str] = [] - for elem in root.iter("p"): - text = "".join(elem.itertext()).strip() - if text: - paragraphs.append(text) - - return "\n\n".join(paragraphs) if paragraphs else None - - -def search_pubmed( - query: str, max_results: int = 5, email: str = None -) -> List[Evidence]: - """Search PubMed and return Evidence objects. - - Results are cached on disk for 24 hours so that repeated identical queries - (e.g. Acquire retries within the same run, or repeated test runs) skip the - 3-request network round-trip entirely. 
- """ - key = _cache_key(query, max_results) - cached = _load_cache(key) - if cached is not None: - print( - f"[CACHE HIT] PubMed cache — skipping network fetch ({len(cached)} articles)" - ) - return cached - - client = PubMedClient(email=email) - pmids = client.search(query, max_results) - - if not pmids: - return [] - - # Fetch summaries, abstracts, and PMC ID mapping in parallel - with ThreadPoolExecutor(max_workers=3) as executor: - fut_summaries = executor.submit(client.fetch_summaries, pmids) - fut_abstracts = executor.submit(client.fetch_abstracts, pmids) - fut_pmc_ids = executor.submit(client.fetch_pmc_ids, pmids) - summaries = fut_summaries.result() - abstracts = fut_abstracts.result() - pmc_ids = fut_pmc_ids.result() # {pmid: "PMC<id>"} for open-access articles - - evidence_list = [] - - for pmid in pmids: - article = summaries.get("result", {}).get(pmid, {}) - if not article: - continue - - pub_date = article.get("pubdate", "") - if not pub_date and "epubdate" in article: - pub_date = article.get("epubdate", "") - - abstract = abstracts.get(pmid, "") - pmcid = pmc_ids.get(pmid) # None if not in PMC open-access - - # Extract publication types from esummary pubtype list. - # Each entry is either a plain string or a dict with a "value" key - # depending on the API version — handle both forms. 
- raw_pubtypes = article.get("pubtype", []) - pub_types = [] - for pt in raw_pubtypes: - if isinstance(pt, str): - pub_types.append(pt) - elif isinstance(pt, dict): - v = pt.get("value") or pt.get("name") or "" - if v: - pub_types.append(v) - - evidence = Evidence( - title=article.get("title", "No title"), - source=article.get("source", "PubMed"), - pmid=pmid, - abstract=abstract, - relevance_score=1.0, - study_type=None, - publication_date=pub_date, - grade_level=None, - pmcid=pmcid, - has_full_text=pmcid is not None, - pub_types=pub_types or None, - ) - evidence_list.append(evidence) - - _save_cache(key, evidence_list) - return evidence_list - - -def fetch_pmc_full_text(pmid: str, email: str = None) -> Optional[str]: - """Convenience wrapper: fetch PMC full text for a single PubMed article. - - Looks up the PMC ID for *pmid* first, then fetches the full article body. - Returns None if the article has no PMC open-access record or on any error. - """ - client = PubMedClient(email=email) - pmc_ids = client.fetch_pmc_ids([pmid]) - pmcid = pmc_ids.get(pmid) - if not pmcid: - return None - return client.fetch_pmc_full_text(pmcid) diff --git a/tests/agents/test_acquire_agent.py b/tests/agents/test_acquire_agent.py deleted file mode 100644 index 47ad138..0000000 --- a/tests/agents/test_acquire_agent.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest -from unittest.mock import Mock, patch -from src.agents.acquire_agent import AcquireAgent -from src.state.schema import WorkflowState, PICOQuery, Evidence - -@pytest.fixture -def mock_llm(): - llm = Mock() - llm.invoke = Mock(return_value=Mock( - content="aspirin AND primary prevention" - )) - return llm - -@pytest.fixture -def sample_state(): - return WorkflowState( - original_question="Should I prescribe aspirin?", - current_step="acquire", - iteration_count=1, - agent_call_counts={}, - execution_history=[], - pico_query=PICOQuery( - patient="60yo male", - intervention="aspirin", - comparison="placebo", - 
outcome="cardiovascular events", - keywords=["aspirin", "primary prevention"] - ) - ) - -@patch('src.agents.acquire_agent.search_pubmed') -def test_acquire_agent_execute_returns_evidence(mock_search, mock_llm, sample_state): - """Test that AcquireAgent returns evidence list""" - mock_search.return_value = [ - Evidence( - title="Aspirin study", - source="JAMA", - pmid="12345", - abstract="Study on aspirin", - relevance_score=0.9 - ) - ] - - agent = AcquireAgent(llm=mock_llm, tools=[]) - result = agent.execute(sample_state) - - assert "evidence_list" in result - assert len(result["evidence_list"]) > 0 diff --git a/tests/agents/test_apply_agent.py b/tests/agents/test_apply_agent.py deleted file mode 100644 index e52ccdc..0000000 --- a/tests/agents/test_apply_agent.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -from unittest.mock import Mock -from src.agents.apply_agent import ApplyAgent -from src.state.schema import WorkflowState, Evidence, AppraisalResults, Recommendation - -@pytest.fixture -def mock_llm(): - llm = Mock() - llm.invoke = Mock(return_value=Mock( - content='{"recommendation": "Consider aspirin with caution", "strength": "Weak", "rationale": "Moderate evidence with bleeding risk", "caveats": ["Monitor for bleeding"]}' - )) - return llm - -@pytest.fixture -def sample_state(): - return WorkflowState( - original_question="Should I prescribe aspirin?", - current_step="apply", - iteration_count=1, - agent_call_counts={}, - execution_history=[], - appraisal_results=AppraisalResults( - evidence=[ - Evidence( - title="Study 1", - source="JAMA", - pmid="123", - abstract="RCT on aspirin", - relevance_score=0.9, - grade_level="Moderate" - ) - ], - has_conflict=False, - conflict_description=None, - summary="Moderate quality evidence" - ) - ) - -def test_apply_agent_execute_returns_recommendation(mock_llm, sample_state): - """Test that ApplyAgent returns Recommendation""" - agent = ApplyAgent(llm=mock_llm, tools=[]) - result = agent.execute(sample_state) - - 
assert "recommendation" in result - assert isinstance(result["recommendation"], Recommendation) - assert result["recommendation"].strength in ["Strong", "Weak"] diff --git a/tests/agents/test_appraise_agent.py b/tests/agents/test_appraise_agent.py deleted file mode 100644 index 5cb2e35..0000000 --- a/tests/agents/test_appraise_agent.py +++ /dev/null @@ -1,40 +0,0 @@ -import pytest -from unittest.mock import Mock -from src.agents.appraise_agent import AppraiseAgent -from src.state.schema import WorkflowState, Evidence, AppraisalResults - -@pytest.fixture -def mock_llm(): - llm = Mock() - llm.invoke = Mock(return_value=Mock( - content='{"grades": ["Moderate"], "has_conflict": false, "summary": "Good quality evidence"}' - )) - return llm - -@pytest.fixture -def sample_state(): - return WorkflowState( - original_question="Should I prescribe aspirin?", - current_step="appraise", - iteration_count=1, - agent_call_counts={}, - execution_history=[], - evidence_list=[ - Evidence( - title="Study 1", - source="JAMA", - pmid="123", - abstract="RCT on aspirin", - relevance_score=0.9 - ) - ] - ) - -def test_appraise_agent_execute_returns_appraisal(mock_llm, sample_state): - """Test that AppraiseAgent returns AppraisalResults""" - agent = AppraiseAgent(llm=mock_llm, tools=[]) - result = agent.execute(sample_state) - - assert "appraisal_results" in result - assert isinstance(result["appraisal_results"], AppraisalResults) - assert result["appraisal_results"].evidence[0].grade_level == "Moderate" diff --git a/tests/agents/test_base.py b/tests/agents/test_base.py deleted file mode 100644 index 6ca450b..0000000 --- a/tests/agents/test_base.py +++ /dev/null @@ -1,26 +0,0 @@ -import pytest -from unittest.mock import Mock -from src.agents.base import BaseAgent -from src.state.schema import WorkflowState - -def test_base_agent_initialization(): - """Test BaseAgent can be initialized""" - llm = Mock() - agent = BaseAgent(llm=llm, tools=[], agent_type="Test") - assert agent.llm == llm - 
assert agent.agent_type == "Test" - -def test_base_agent_execute_not_implemented(): - """Test that execute method must be implemented by subclasses""" - llm = Mock() - agent = BaseAgent(llm=llm, tools=[], agent_type="Test") - state = WorkflowState( - original_question="test", - current_step="test", - iteration_count=0, - agent_call_counts={}, - execution_history=[] - ) - - with pytest.raises(NotImplementedError): - agent.execute(state)