liningping · liningping · Mar 15, 2026 · Mar 15, 2026 · Mar 16, 2026
diff --git a/benchmarks/deep_research_bench/data/test_data/cleaned_data/edr_qwen3-max_0316.jsonl b/benchmarks/deep_research_bench/data/test_data/cleaned_data/edr_qwen3-max_0316.jsonl
diff --git a/benchmarks/deep_research_bench/data/test_data/cleaned_data/edr_qwen3-max_wo_RAGdenoise.jsonl b/benchmarks/deep_research_bench/data/test_data/cleaned_data/edr_qwen3-max_wo_RAGdenoise.jsonl
diff --git a/benchmarks/deep_research_bench/data/test_data/cleaned_data/edr_qwen3-max_wo_verify.jsonl b/benchmarks/deep_research_bench/data/test_data/cleaned_data/edr_qwen3-max_wo_verify.jsonl
diff --git a/benchmarks/deep_research_bench/data/test_data/raw_data/edr_qwen3-max_0316.jsonl b/benchmarks/deep_research_bench/data/test_data/raw_data/edr_qwen3-max_0316.jsonl
diff --git a/benchmarks/deep_research_bench/data/test_data/raw_data/edr_qwen3-max_wo_RAGdenoise.jsonl b/benchmarks/deep_research_bench/data/test_data/raw_data/edr_qwen3-max_wo_RAGdenoise.jsonl
diff --git a/benchmarks/deep_research_bench/data/test_data/raw_data/edr_qwen3-max_wo_verify.jsonl b/benchmarks/deep_research_bench/data/test_data/raw_data/edr_qwen3-max_wo_verify.jsonl
diff --git a/benchmarks/deep_research_bench/results/race/edr_gemini/race_result.txt b/benchmarks/deep_research_bench/results/race/edr_gemini/race_result.txt
diff --git a/benchmarks/deep_research_bench/results/race/edr_qwen3-max_0316/race_result.txt b/benchmarks/deep_research_bench/results/race/edr_qwen3-max_0316/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4993
+Insight: 0.5146
+Instruction Following: 0.5119
+Readability: 0.5044
+Overall Score: 0.5081
diff --git a/...results/race/edr_gemini/raw_results.jsonl → ...race/edr_qwen3-max_0316/raw_results.jsonl b/...results/race/edr_gemini/raw_results.jsonl → ...race/edr_qwen3-max_0316/raw_results.jsonl
diff --git a/benchmarks/deep_research_bench/results/race/edr_qwen3-max_wo_RAGdenoise/race_result.txt b/benchmarks/deep_research_bench/results/race/edr_qwen3-max_wo_RAGdenoise/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4982
+Insight: 0.5128
+Instruction Following: 0.5119
+Readability: 0.5027
+Overall Score: 0.5073
diff --git a/...ults/race/edr_gemini/ranked_results.jsonl → ...qwen3-max_wo_RAGdenoise/raw_results.jsonl b/...ults/race/edr_gemini/ranked_results.jsonl → ...qwen3-max_wo_RAGdenoise/raw_results.jsonl
diff --git a/benchmarks/deep_research_bench/results/race/edr_qwen3-max_wo_verify/race_result.txt b/benchmarks/deep_research_bench/results/race/edr_qwen3-max_wo_verify/race_result.txt
@@ -0,0 +1,5 @@
+Comprehensiveness: 0.4883
+Insight: 0.4965
+Instruction Following: 0.5061
+Readability: 0.4974
+Overall Score: 0.4968
diff --git a/benchmarks/deep_research_bench/results/race/edr_qwen3-max_wo_verify/raw_results.jsonl b/benchmarks/deep_research_bench/results/race/edr_qwen3-max_wo_verify/raw_results.jsonl
diff --git a/benchmarks/deep_research_bench/run_benchmark.sh b/benchmarks/deep_research_bench/run_benchmark.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Target model name list
-TARGET_MODELS=("edr_qwen3-max")
+TARGET_MODELS=("edr_qwen3-max_0316")
 # Common parameters for both RACE and Citation evaluations
 RAW_DATA_DIR="data/test_data/raw_data"
 OUTPUT_DIR="results"

diff --git a/benchmarks/run_research.sh b/benchmarks/run_research.sh
@@ -21,24 +21,25 @@ mkdir -p $LOGS_DIR
 # --output_dir drb_steer_trajectories \
 # --max_concurrent 1 \
 # Define the ablation settings to test:
-# format: "ENABLE_VERIFICATION BASIC_REPORT_DENOISING"
+# format: "DISABLE_VERIFICATION DISABLE_REPORT_DENOISING"
 ABLATIONS=(
-  "true true"     # No RAGdenoise: Verification ON, RAGdenoise OFF
-  "false false"   # No Verify: Verification OFF, RAGdenoise ON
+    "false false"   # full agent
+  # "false true"    # No denoise: verification ON, report denoising OFF
+  # "true false"    # No verify: verification OFF, report denoising ON
 )
 
 for ablation in "${ABLATIONS[@]}"; do
   read -r VERIFY DENOISE <<< "$ablation"
 
-  export ENABLE_VERIFICATION="$VERIFY"
-  export BASIC_REPORT_DENOISING="$DENOISE"
+  export DISABLE_VERIFICATION="$VERIFY"
+  export DISABLE_REPORT_DENOISING="$DENOISE"
 
   ABLATION_SUFFIX=""
-  if [[ "$VERIFY" == "false" ]]; then
+  if [[ "$VERIFY" == "true" ]]; then
     ABLATION_SUFFIX="${ABLATION_SUFFIX}_wo_verify"
   fi
   if [[ "$DENOISE" == "true" ]]; then
-    ABLATION_SUFFIX="${ABLATION_SUFFIX}_wo_RAGdenoise"
+    ABLATION_SUFFIX="${ABLATION_SUFFIX}_wo_denoise"
   fi
 
   echo "Starting run for ablation: VERIFY=$VERIFY, DENOISE=$DENOISE, SUFFIX=$ABLATION_SUFFIX"
@@ -82,4 +83,4 @@ done
 #     --collect-traj > $LOGS_DIR/healthbench_traj_all.log 2>&1 &
 
 # python process_healthbench.py  --input-dir healthbench_trajectories
-# python evaluate_healthbench.py healthbench_results/edr_healthbench_final_run_100.jsonl --grader-model gpt-4.1-2025-04-14
+# python evaluate_healthbench.py healthbench_results/edr_healthbench_final_run_100.jsonl --grader-model gpt-4.1-2025-04-14
diff --git a/deep_research/multi_agent_supervisor.py b/deep_research/multi_agent_supervisor.py
@@ -61,7 +61,7 @@ def get_notes_from_tool_calls(messages: list[BaseMessage]) -> list[str]:
     tool_msgs = filter_messages(messages, include_types="tool")
     for msg in tool_msgs:
         if msg.name == "ConductResearch":
-            if os.getenv("ENABLE_VERIFICATION", "true").lower() == "true":
+            if not (os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"):
                 if "FAIL:" not in str(msg.content):
                     valid_notes.append(str(msg.content))
             else:
@@ -135,7 +135,7 @@ async def supervisor(state: SupervisorState) -> Command[Literal["supervisor_tool
 
     # Prepare system message with current date and constraints
 
-    if os.getenv("ENABLE_VERIFICATION", "true").lower() == "true":
+    if not (os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"):
         system_message = lead_researcher_with_multiple_steps_diffusion_double_check_prompt.format(
             date=get_today_str(), 
             max_concurrent_research_units=max_concurrent_researchers,
@@ -256,7 +256,7 @@ async def supervisor_tools(state: SupervisorState) -> Command[Literal["superviso
                 for result, tool_call in zip(tool_results, conduct_research_calls):
                     raw_findings = result.get("compressed_research", "Error synthesizing research report")
 
-                    if os.getenv("ENABLE_VERIFICATION", "true").lower() == "true":
+                    if not (os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"):
                         assertions = tool_call["args"].get("verification_assertions", [])
 
                         if not assertions:

diff --git a/deep_research/prompts.py b/deep_research/prompts.py
@@ -382,13 +382,9 @@
 
 The cleaned findings will be used for final report generation, so comprehensiveness is critical."""
 
-final_report_generation_with_helpfulness_insightfulness_hit_citation_prompt = """
+final_report_generation_step1_prompt = """
 You are an expert Deep Research Analyst and meticulous Fact-Checker. 
-Your task is to synthesize research findings, resolve conflicts, and produce a highly insightful, publication-ready report based on a research brief and an initial draft.
-
-CRITICAL: The final report in Step 4 MUST be written in the same language as the human messages! 
-For example, if the user's messages are in English, then MAKE SURE you write your response in English. If the user's messages are in Chinese, then MAKE SURE you write your entire response in Chinese.
-This is critical. The user will only understand the answer if it is written in the same language as their input message.
+Your task is to analyze research findings against a research brief and an initial draft, extracting evidence and synthesizing a plan for the final report.
 
 Today's date is {date}.
 
@@ -405,7 +401,7 @@
 </Findings>
 
 === EXECUTION INSTRUCTIONS ===
-You must process this request in a single response, strictly following these 4 sequential steps. 
+You must process this request in a single response, strictly following these 3 sequential steps. 
 
 ### Step 1: Evidence Anchoring (Tagging)
 Scan the <Findings> and identify specific sentences, paragraphs, or data points that are highly relevant to answering the <Research Brief> or addressing points in the <Draft Report>. 
@@ -424,60 +420,14 @@
 - Think step-by-step about how to construct the final report. 
 - Address any [REFUTE] facts: How will you correct the draft?
 - Address [SUPPLEMENT] facts: Where is the best place to insert this new information?
-- Plan the overall structure of your final report based on the structural examples provided in Step 4.
-
-### Step 4: Final Report Generation
-Now, write the final detailed answer to the overall research brief based on your reasoning.
-
-[Structure Guidance]
-You can structure your report in a number of different ways. Here are some examples:
-- To compare two things: 1/ intro 2/ overview of topic A 3/ overview of topic B 4/ comparison between A and B 5/ conclusion
-- To return a list: 1/ list of things or table of things (Or make each item a separate section. No intro/conclusion needed for lists).
-- To summarize/overview: 1/ overview of topic 2/ concept 1 3/ concept 2 4/ concept 3 5/ conclusion
-REMEMBER: Section is a VERY fluid and loose concept. You can structure your report however you think is best. Make sure sections are cohesive and make sense for the reader.
-
-[Writing & Formatting Rules]
-For each section of the report, do the following:
-- Have an explicit discussion in simple, clear language. DO NOT oversimplify. Clarify when a concept is ambiguous.
-- DO NOT list facts in bullet points. Write in paragraph form.
-- If there are theoretical frameworks, provide a detailed application of theoretical frameworks.
-- For comparison and conclusion, include a summary table.
-- Use ## for section title (Markdown format) for each section. (# for title, ### for subsections).
-- Do NOT ever refer to yourself as the writer of the report. No self-referential language.
-- Do not say what you are doing in the report. Just write it.
-- Each section should be fairly long and verbose. You are writing a deep research report, and users expect a thorough answer.
-
-[Quality Check Rules]
-Ensure your final report strictly adheres to these rules:
-<Insightfulness Rules>
-- Granular breakdown - Does the response have a granular breakdown of the topics and their specific causes and specific impacts?
-- Detailed mapping table - Does the response have a detailed table mapping these causes and effects?
-- Nuanced discussion - Does the response have detailed exploration of the topic and explicit discussion?
-</Insightfulness Rules>
-<Helpfulness Rules>
-- Satisfying user intent – Does the response directly address the user’s request or question?
-- Ease of understanding – Is the response fluent, coherent, and logically structured?
-- Accuracy – Are the facts, reasoning, and explanations correct?
-- Appropriate language – Is the tone suitable and professional, without unnecessary jargon or confusing phrasing?
-</Helpfulness Rules>
-
-[Citation Rules]
-- Assign each unique URL a single citation number in your text.
-- End with ### Sources that lists each source with corresponding numbers.
-- Include the URL in the ### Sources section only. Use the citation number in the other sections.
-- IMPORTANT: Number sources sequentially without gaps (1,2,3,4...) in the final list.
-- Each source should be a separate line item.
-- Example format:
-  [1] Source Title: URL
-  [2] Source Title: URL
-- Citations are extremely important. Pay a lot of attention to getting these right.
+- Plan the overall structure of the final report.
 
 === BEGIN YOUR RESPONSE ===
 """
 
-final_report_generation_basic_denoise_prompt = """
-You are an expert Deep Research Analyst and meticulous Fact-Checker. 
-Your task is to synthesize research findings, resolve conflicts, and produce a highly insightful, publication-ready report based on a research brief and an initial draft.
+final_report_generation_step2_prompt = """
+You are an expert Deep Research Analyst and meticulous Report Writer. 
+Your task is to write a highly insightful, publication-ready final report based on a research brief, an initial draft, and research findings or synthesis.
 
 CRITICAL: The final report MUST be written in the same language as the human messages! 
 For example, if the user's messages are in English, then MAKE SURE you write your response in English. If the user's messages are in Chinese, then MAKE SURE you write your entire response in Chinese.
@@ -493,22 +443,12 @@
 {draft_report}
 </Draft Report>
 
-<Findings>
-{findings}
-</Findings>
+<Fact-Checking Synthesis or Raw Findings>
+{synthesis_or_findings}
+</Fact-Checking Synthesis or Raw Findings>
 
 === EXECUTION INSTRUCTIONS ===
-You must process this request in a single response, strictly following these steps to denoise the draft report using findings:
-
-### Step 1: Synthesis & Conflict Resolution (Chain-of-Thought)
-Wrap your thinking process in `<reasoning>` tags.
-- Think step-by-step about how to update the draft report and integrate the newly discovered findings. 
-- Address any contradictions: How will you correct the draft based on the findings?
-- Address new information: Where is the best place to insert this new information?
-- Plan the overall structure of your final report based on the structural examples provided below.
-
-### Step 2: Final Report Generation
-Now, write the final detailed answer to the overall research brief based on your reasoning.
+Based on the <Fact-Checking Synthesis or Raw Findings> provided above, write the final detailed answer to the overall research brief.
 
 [Structure Guidance]
 You can structure your report in a number of different ways. Here are some examples:

diff --git a/deep_research/research_agent_full.py b/deep_research/research_agent_full.py
@@ -17,8 +17,8 @@
 
 from deep_research.utils import get_today_str
 from deep_research.prompts import (
-    final_report_generation_with_helpfulness_insightfulness_hit_citation_prompt,
-    final_report_generation_basic_denoise_prompt
+    final_report_generation_step1_prompt,
+    final_report_generation_step2_prompt
 )
 from deep_research.state_scope import AgentState, AgentInputState
 from deep_research.research_agent_scope import clarify_with_user, write_research_brief, write_draft_report
@@ -52,23 +52,34 @@ async def final_report_generation(state: AgentState):
 
     findings = "\n".join(notes)
 
-    if os.getenv("BASIC_REPORT_DENOISING", "false").lower() == "true":
-        final_report_prompt = final_report_generation_basic_denoise_prompt.format(
+    if os.getenv("DISABLE_REPORT_DENOISING", "false").lower() == "true":
+        # Ablation baseline: Directly pass findings without Synthesis (Step 1)
+        writer_prompt = final_report_generation_step2_prompt.format(
             research_brief=state.get("research_brief", ""),
-            findings=findings,
+            synthesis_or_findings=findings,
             date=get_today_str(),
             draft_report=state.get("draft_report", "")
         )
+        final_report = await writer_model.ainvoke([HumanMessage(content=writer_prompt)])
     else:
-        final_report_prompt = final_report_generation_with_helpfulness_insightfulness_hit_citation_prompt.format(
+        # Phase 1: Analysis & Synthesis (Evidence Extraction & Conflict Resolution)
+        step1_prompt = final_report_generation_step1_prompt.format(
             research_brief=state.get("research_brief", ""),
             findings=findings,
             date=get_today_str(),
-            draft_report=state.get("draft_report", ""),
-            user_request=state.get("user_request", "")
+            draft_report=state.get("draft_report", "")
         )
-
-    final_report = await writer_model.ainvoke([HumanMessage(content=final_report_prompt)])
+        step1_response = await writer_model.ainvoke([HumanMessage(content=step1_prompt)])
+        synthesis = step1_response.content
+
+        # Phase 2: Final Report Generation
+        step2_prompt = final_report_generation_step2_prompt.format(
+            research_brief=state.get("research_brief", ""),
+            synthesis_or_findings=synthesis,
+            date=get_today_str(),
+            draft_report=state.get("draft_report", "")
+        )
+        final_report = await writer_model.ainvoke([HumanMessage(content=step2_prompt)])
 
     return {
         "final_report": final_report.content, 

diff --git a/deep_research/state_multi_agent_supervisor.py b/deep_research/state_multi_agent_supervisor.py
@@ -15,7 +15,7 @@
 from pydantic import BaseModel, Field
 import os
 
-ENABLE_VERIFICATION = os.getenv("ENABLE_VERIFICATION", "true").lower() == "true"
+DISABLE_VERIFICATION = os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"
 
 class SupervisorState(TypedDict):
     """
@@ -38,7 +38,7 @@ class SupervisorState(TypedDict):
     # Draft report
     draft_report: str
 
-if ENABLE_VERIFICATION:
+if not DISABLE_VERIFICATION:
     @tool
     class ConductResearch(BaseModel):
         """Tool for delegating a research task to a specialized sub-agent."""