Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Comprehensiveness: 0.4993
Insight: 0.5146
Instruction Following: 0.5119
Readability: 0.5044
Overall Score: 0.5081

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Comprehensiveness: 0.4982
Insight: 0.5128
Instruction Following: 0.5119
Readability: 0.5027
Overall Score: 0.5073

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Comprehensiveness: 0.4883
Insight: 0.4965
Instruction Following: 0.5061
Readability: 0.4974
Overall Score: 0.4968

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion benchmarks/deep_research_bench/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
# Target model name list
TARGET_MODELS=("edr_qwen3-max")
TARGET_MODELS=("edr_qwen3-max_0316")
# Common parameters for both RACE and Citation evaluations
RAW_DATA_DIR="data/test_data/raw_data"
OUTPUT_DIR="results"
Expand Down
17 changes: 9 additions & 8 deletions benchmarks/run_research.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,25 @@ mkdir -p $LOGS_DIR
# --output_dir drb_steer_trajectories \
# --max_concurrent 1 \
# Define the ablation settings to test:
# format: "ENABLE_VERIFICATION BASIC_REPORT_DENOISING"
# format: "DISABLE_VERIFICATION DISABLE_REPORT_DENOISING"
ABLATIONS=(
"true true" # No RAGdenoise: Verification ON, RAGdenoise OFF
"false false" # No Verify: Verification OFF, RAGdenoise ON
"false false" # full agent
# "false true" # No denoise
# "true false" # No Verify
)

for ablation in "${ABLATIONS[@]}"; do
read -r VERIFY DENOISE <<< "$ablation"

export ENABLE_VERIFICATION="$VERIFY"
export BASIC_REPORT_DENOISING="$DENOISE"
export DISABLE_VERIFICATION="$VERIFY"
export DISABLE_REPORT_DENOISING="$DENOISE"

ABLATION_SUFFIX=""
if [[ "$VERIFY" == "false" ]]; then
if [[ "$VERIFY" == "true" ]]; then
ABLATION_SUFFIX="${ABLATION_SUFFIX}_wo_verify"
fi
if [[ "$DENOISE" == "true" ]]; then
ABLATION_SUFFIX="${ABLATION_SUFFIX}_wo_RAGdenoise"
ABLATION_SUFFIX="${ABLATION_SUFFIX}_wo_denoise"
fi

echo "Starting run for ablation: VERIFY=$VERIFY, DENOISE=$DENOISE, SUFFIX=$ABLATION_SUFFIX"
Expand Down Expand Up @@ -82,4 +83,4 @@ done
# --collect-traj > $LOGS_DIR/healthbench_traj_all.log 2>&1 &

# python process_healthbench.py --input-dir healthbench_trajectories
# python evaluate_healthbench.py healthbench_results/edr_healthbench_final_run_100.jsonl --grader-model gpt-4.1-2025-04-14
# python evaluate_healthbench.py healthbench_results/edr_healthbench_final_run_100.jsonl --grader-model gpt-4.1-2025-04-14
6 changes: 3 additions & 3 deletions deep_research/multi_agent_supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def get_notes_from_tool_calls(messages: list[BaseMessage]) -> list[str]:
tool_msgs = filter_messages(messages, include_types="tool")
for msg in tool_msgs:
if msg.name == "ConductResearch":
if os.getenv("ENABLE_VERIFICATION", "true").lower() == "true":
if not (os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"):
if "FAIL:" not in str(msg.content):
valid_notes.append(str(msg.content))
else:
Expand Down Expand Up @@ -135,7 +135,7 @@ async def supervisor(state: SupervisorState) -> Command[Literal["supervisor_tool

# Prepare system message with current date and constraints

if os.getenv("ENABLE_VERIFICATION", "true").lower() == "true":
if not (os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"):
system_message = lead_researcher_with_multiple_steps_diffusion_double_check_prompt.format(
date=get_today_str(),
max_concurrent_research_units=max_concurrent_researchers,
Expand Down Expand Up @@ -256,7 +256,7 @@ async def supervisor_tools(state: SupervisorState) -> Command[Literal["superviso
for result, tool_call in zip(tool_results, conduct_research_calls):
raw_findings = result.get("compressed_research", "Error synthesizing research report")

if os.getenv("ENABLE_VERIFICATION", "true").lower() == "true":
if not (os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"):
assertions = tool_call["args"].get("verification_assertions", [])

if not assertions:
Expand Down
82 changes: 11 additions & 71 deletions deep_research/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,13 +382,9 @@

The cleaned findings will be used for final report generation, so comprehensiveness is critical."""

final_report_generation_with_helpfulness_insightfulness_hit_citation_prompt = """
final_report_generation_step1_prompt = """
You are an expert Deep Research Analyst and meticulous Fact-Checker.
Your task is to synthesize research findings, resolve conflicts, and produce a highly insightful, publication-ready report based on a research brief and an initial draft.

CRITICAL: The final report in Step 4 MUST be written in the same language as the human messages!
For example, if the user's messages are in English, then MAKE SURE you write your response in English. If the user's messages are in Chinese, then MAKE SURE you write your entire response in Chinese.
This is critical. The user will only understand the answer if it is written in the same language as their input message.
Your task is to analyze research findings against a research brief and an initial draft, extracting evidence and synthesizing a plan for the final report.

Today's date is {date}.

Expand All @@ -405,7 +401,7 @@
</Findings>

=== EXECUTION INSTRUCTIONS ===
You must process this request in a single response, strictly following these 4 sequential steps.
You must process this request in a single response, strictly following these 3 sequential steps.

### Step 1: Evidence Anchoring (Tagging)
Scan the <Findings> and identify specific sentences, paragraphs, or data points that are highly relevant to answering the <Research Brief> or addressing points in the <Draft Report>.
Expand All @@ -424,60 +420,14 @@
- Think step-by-step about how to construct the final report.
- Address any [REFUTE] facts: How will you correct the draft?
- Address [SUPPLEMENT] facts: Where is the best place to insert this new information?
- Plan the overall structure of your final report based on the structural examples provided in Step 4.

### Step 4: Final Report Generation
Now, write the final detailed answer to the overall research brief based on your reasoning.

[Structure Guidance]
You can structure your report in a number of different ways. Here are some examples:
- To compare two things: 1/ intro 2/ overview of topic A 3/ overview of topic B 4/ comparison between A and B 5/ conclusion
- To return a list: 1/ list of things or table of things (Or make each item a separate section. No intro/conclusion needed for lists).
- To summarize/overview: 1/ overview of topic 2/ concept 1 3/ concept 2 4/ concept 3 5/ conclusion
REMEMBER: Section is a VERY fluid and loose concept. You can structure your report however you think is best. Make sure sections are cohesive and make sense for the reader.

[Writing & Formatting Rules]
For each section of the report, do the following:
- Have an explicit discussion in simple, clear language. DO NOT oversimplify. Clarify when a concept is ambiguous.
- DO NOT list facts in bullet points. Write in paragraph form.
- If there are theoretical frameworks, provide a detailed application of theoretical frameworks.
- For comparison and conclusion, include a summary table.
- Use ## for section title (Markdown format) for each section. (# for title, ### for subsections).
- Do NOT ever refer to yourself as the writer of the report. No self-referential language.
- Do not say what you are doing in the report. Just write it.
- Each section should be fairly long and verbose. You are writing a deep research report, and users expect a thorough answer.

[Quality Check Rules]
Ensure your final report strictly adheres to these rules:
<Insightfulness Rules>
- Granular breakdown - Does the response have a granular breakdown of the topics and their specific causes and specific impacts?
- Detailed mapping table - Does the response have a detailed table mapping these causes and effects?
- Nuanced discussion - Does the response have detailed exploration of the topic and explicit discussion?
</Insightfulness Rules>
<Helpfulness Rules>
- Satisfying user intent – Does the response directly address the user’s request or question?
- Ease of understanding – Is the response fluent, coherent, and logically structured?
- Accuracy – Are the facts, reasoning, and explanations correct?
- Appropriate language – Is the tone suitable and professional, without unnecessary jargon or confusing phrasing?
</Helpfulness Rules>

[Citation Rules]
- Assign each unique URL a single citation number in your text.
- End with ### Sources that lists each source with corresponding numbers.
- Include the URL in the ### Sources section only. Use the citation number in the other sections.
- IMPORTANT: Number sources sequentially without gaps (1,2,3,4...) in the final list.
- Each source should be a separate line item.
- Example format:
[1] Source Title: URL
[2] Source Title: URL
- Citations are extremely important. Pay a lot of attention to getting these right.
- Plan the overall structure of the final report.

=== BEGIN YOUR RESPONSE ===
"""

final_report_generation_basic_denoise_prompt = """
You are an expert Deep Research Analyst and meticulous Fact-Checker.
Your task is to synthesize research findings, resolve conflicts, and produce a highly insightful, publication-ready report based on a research brief and an initial draft.
final_report_generation_step2_prompt = """
You are an expert Deep Research Analyst and meticulous Report Writer.
Your task is to write a highly insightful, publication-ready final report based on a research brief, an initial draft, and research findings or synthesis.

CRITICAL: The final report MUST be written in the same language as the human messages!
For example, if the user's messages are in English, then MAKE SURE you write your response in English. If the user's messages are in Chinese, then MAKE SURE you write your entire response in Chinese.
Expand All @@ -493,22 +443,12 @@
{draft_report}
</Draft Report>

<Findings>
{findings}
</Findings>
<Fact-Checking Synthesis or Raw Findings>
{synthesis_or_findings}
</Fact-Checking Synthesis or Raw Findings>

=== EXECUTION INSTRUCTIONS ===
You must process this request in a single response, strictly following these steps to denoise the draft report using findings:

### Step 1: Synthesis & Conflict Resolution (Chain-of-Thought)
Wrap your thinking process in `<reasoning>` tags.
- Think step-by-step about how to update the draft report and integrate the newly discovered findings.
- Address any contradictions: How will you correct the draft based on the findings?
- Address new information: Where is the best place to insert this new information?
- Plan the overall structure of your final report based on the structural examples provided below.

### Step 2: Final Report Generation
Now, write the final detailed answer to the overall research brief based on your reasoning.
Based on the <Fact-Checking Synthesis or Raw Findings> provided above, write the final detailed answer to the overall research brief.

[Structure Guidance]
You can structure your report in a number of different ways. Here are some examples:
Expand Down
31 changes: 21 additions & 10 deletions deep_research/research_agent_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

from deep_research.utils import get_today_str
from deep_research.prompts import (
final_report_generation_with_helpfulness_insightfulness_hit_citation_prompt,
final_report_generation_basic_denoise_prompt
final_report_generation_step1_prompt,
final_report_generation_step2_prompt
)
from deep_research.state_scope import AgentState, AgentInputState
from deep_research.research_agent_scope import clarify_with_user, write_research_brief, write_draft_report
Expand Down Expand Up @@ -52,23 +52,34 @@ async def final_report_generation(state: AgentState):

findings = "\n".join(notes)

if os.getenv("BASIC_REPORT_DENOISING", "false").lower() == "true":
final_report_prompt = final_report_generation_basic_denoise_prompt.format(
if os.getenv("DISABLE_REPORT_DENOISING", "false").lower() == "true":
# Ablation baseline: Directly pass findings without Synthesis (Step 1)
writer_prompt = final_report_generation_step2_prompt.format(
research_brief=state.get("research_brief", ""),
findings=findings,
synthesis_or_findings=findings,
date=get_today_str(),
draft_report=state.get("draft_report", "")
)
final_report = await writer_model.ainvoke([HumanMessage(content=writer_prompt)])
else:
final_report_prompt = final_report_generation_with_helpfulness_insightfulness_hit_citation_prompt.format(
# Phase 1: Analysis & Synthesis (Evidence Extraction & Conflict Resolution)
step1_prompt = final_report_generation_step1_prompt.format(
research_brief=state.get("research_brief", ""),
findings=findings,
date=get_today_str(),
draft_report=state.get("draft_report", ""),
user_request=state.get("user_request", "")
draft_report=state.get("draft_report", "")
)

final_report = await writer_model.ainvoke([HumanMessage(content=final_report_prompt)])
step1_response = await writer_model.ainvoke([HumanMessage(content=step1_prompt)])
synthesis = step1_response.content

# Phase 2: Final Report Generation
step2_prompt = final_report_generation_step2_prompt.format(
research_brief=state.get("research_brief", ""),
synthesis_or_findings=synthesis,
date=get_today_str(),
draft_report=state.get("draft_report", "")
)
final_report = await writer_model.ainvoke([HumanMessage(content=step2_prompt)])

return {
"final_report": final_report.content,
Expand Down
4 changes: 2 additions & 2 deletions deep_research/state_multi_agent_supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pydantic import BaseModel, Field
import os

ENABLE_VERIFICATION = os.getenv("ENABLE_VERIFICATION", "true").lower() == "true"
DISABLE_VERIFICATION = os.getenv("DISABLE_VERIFICATION", "false").lower() == "true"

class SupervisorState(TypedDict):
"""
Expand All @@ -38,7 +38,7 @@ class SupervisorState(TypedDict):
# Draft report
draft_report: str

if ENABLE_VERIFICATION:
if not DISABLE_VERIFICATION:
@tool
class ConductResearch(BaseModel):
"""Tool for delegating a research task to a specialized sub-agent."""
Expand Down