From a4953e3b58f7353f5b96f6c4c9d3be123a9e4f6f Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Sun, 7 Dec 2025 18:32:39 -0500 Subject: [PATCH 1/7] Add SciAgent scientific research flow configuration --- configs/config.yaml | 4 +- configs/statemachines/flows/sciagent.yaml | 54 +++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 configs/statemachines/flows/sciagent.yaml diff --git a/configs/config.yaml b/configs/config.yaml index bac92858..6aae1d13 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -52,6 +52,8 @@ flows: enabled: false jina_ai: enabled: false + sciagent: + enabled: false # Output configuration outputs: @@ -68,4 +70,4 @@ performance: enable_parallel_execution: true enable_result_caching: true cache_ttl: 3600 # 1 hour - enable_workflow_optimization: true \ No newline at end of file + enable_workflow_optimization: true diff --git a/configs/statemachines/flows/sciagent.yaml b/configs/statemachines/flows/sciagent.yaml new file mode 100644 index 00000000..bac7f918 --- /dev/null +++ b/configs/statemachines/flows/sciagent.yaml @@ -0,0 +1,54 @@ +# @package _global_ +# SciAgent: generic scientific research flow +# High-level, domain-agnostic research pipeline: +# Parse → Hypothesize → Search → Analyze → Synthesize → Dataset logging + +enabled: true + +params: + max_iterations: 8 + max_hypotheses: 5 + trace_reasoning: true + generate_datasets: true + require_verifiable_sources: true + +stages: + parse: + enabled: true + classify_intent: true # classify: mechanistic, comparative, predictive, etc. + extract_entities: true # genes, proteins, diseases, interventions, etc. + detect_domains: true # e.g. 
bio, chem, clinical, methods + + hypothesize: + enabled: true + generate_candidates: true # generate multiple candidate hypotheses + rank_by_plausibility: true + ensure_falsifiable: true # enforce falsifiability / testability constraints + + literature_review: + enabled: true + use_deepsearch_flow: true # delegate to deepsearch flow when available + use_rag_flow: true # combine with RAG retrieval + min_primary_sources: 5 + min_review_articles: 1 + + analysis: + enabled: true + aggregate_evidence: true # combine evidence across sources + detect_conflicts: true # detect conflicting findings/claims + assess_evidence_quality: true + + synthesis: + enabled: true + write_answer: true # final answer to the question + write_critical_review: true # more systematic, structured review + write_methods_section: true # methods-style description of how evidence was gathered + + dataset_logging: + enabled: true + save_hypothesis_traces: true + save_review_traces: true + save_methods_traces: true + save_tool_calls: true + save_state_snapshots: true + From a04d760608df352e807ff9cde6cf47115f8e5966 Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Mon, 8 Dec 2025 14:02:04 -0500 Subject: [PATCH 2/7] sciagent class added --- DeepResearch/app.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/DeepResearch/app.py b/DeepResearch/app.py index 5337502b..34fcf06d 100644 --- a/DeepResearch/app.py +++ b/DeepResearch/app.py @@ -827,6 +827,40 @@ def _extract_summary(self, data_bag: Dict[str, Any], problem: StructuredProblem) return "\n".join(summary_parts) if summary_parts else "Analysis completed with available results." 
+# --- SciAgent flow nodes --- +@dataclass +class SciAgentParse(BaseNode[ResearchState]): + async def run(self, ctx: GraphRunContext[ResearchState]) -> 'SciAgentExecute': + # Import here to avoid circular imports + from .src.statemachines.sciagent_workflow import run_sciagent_workflow + + question = ctx.state.question + cfg = ctx.state.config + + ctx.state.notes.append("Starting SciAgent workflow") + + # Run the complete SciAgent workflow + try: + final_answer = run_sciagent_workflow(question, cfg) + ctx.state.answers.append(final_answer) + ctx.state.notes.append("SciAgent workflow completed successfully") + except Exception as e: + error_msg = f"SciAgent workflow failed: {str(e)}" + ctx.state.notes.append(error_msg) + ctx.state.answers.append(f"Error: {error_msg}") + + return SciAgentExecute() + + +@dataclass +class SciAgentExecute(BaseNode[ResearchState]): + async def run(self, ctx: GraphRunContext[ResearchState]) -> Annotated[End[str], Edge(label="done")]: + # The SciAgent workflow is already complete, just return the result + if ctx.state.answers: + return End(ctx.state.answers[-1]) + else: + return End("SciAgent analysis completed.") + # --- Bioinformatics flow nodes --- @dataclass From 8e268a68cc06d89731947ed662fa8215495146a5 Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Mon, 8 Dec 2025 14:07:45 -0500 Subject: [PATCH 3/7] Route SciAgent statemachine flow in app.py --- DeepResearch/app.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/DeepResearch/app.py b/DeepResearch/app.py index 34fcf06d..1f8db194 100644 --- a/DeepResearch/app.py +++ b/DeepResearch/app.py @@ -101,6 +101,12 @@ async def run(self, ctx: GraphRunContext[ResearchState]) -> Union[Search, Primar # Route to RAG flow if enabled rag_cfg = getattr(getattr(cfg, "flows", {}), "rag", None) + # Route to SciAgent flow if enabled + sciagent_cfg = getattr(getattr(cfg, "flows", {}), "sciagent", None) + if getattr(sciagent_cfg or {}, "enabled", False): + 
ctx.state.notes.append("SciAgent flow enabled") + return SciAgentParse() + if getattr(rag_cfg or {}, "enabled", False): ctx.state.notes.append("RAG flow enabled") return RAGParse() @@ -932,9 +938,17 @@ async def run(self, ctx: GraphRunContext[ResearchState]) -> Annotated[End[str], def run_graph(question: str, cfg: DictConfig) -> str: state = ResearchState(question=question, config=cfg) # Include all nodes in runtime graph - instantiate them - nodes = (Plan(), Search(), Analyze(), Synthesize(), PrepareChallenge(), RunChallenge(), EvaluateChallenge(), - DSPlan(), DSExecute(), DSAnalyze(), DSSynthesize(), PrimeParse(), PrimePlan(), PrimeExecute(), PrimeEvaluate(), - BioinformaticsParse(), BioinformaticsFuse(), RAGParse(), RAGExecute(), PrimaryREACTWorkflow(), EnhancedREACTWorkflow()) + nodes = ( + Plan(), Search(), Analyze(), Synthesize(), + PrepareChallenge(), RunChallenge(), EvaluateChallenge(), + DSPlan(), DSExecute(), DSAnalyze(), DSSynthesize(), + PrimeParse(), PrimePlan(), PrimeExecute(), PrimeEvaluate(), + BioinformaticsParse(), BioinformaticsFuse(), + SciAgentParse(), SciAgentExecute(), + RAGParse(), RAGExecute(), + PrimaryREACTWorkflow(), EnhancedREACTWorkflow() + ) + g = Graph(nodes=nodes, state_type=ResearchState) result = asyncio.run(g.run(Plan(), state=state)) return result.output From 150dd8544d5576a55caca70e76cf7632a1fbbb6e Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Mon, 8 Dec 2025 14:13:35 -0500 Subject: [PATCH 4/7] Add minimal SciAgent workflow stub with dataset logging --- .../src/statemachines/sciagent_workflow.py | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 DeepResearch/src/statemachines/sciagent_workflow.py diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py new file mode 100644 index 00000000..d8d2292a --- /dev/null +++ b/DeepResearch/src/statemachines/sciagent_workflow.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import 
json +import os +from dataclasses import asdict, dataclass +from datetime import datetime +from typing import Any, Dict, List, Optional + +from omegaconf import DictConfig + + +@dataclass +class SciAgentStep: + """Single step in the SciAgent reasoning trace.""" + name: str + description: str + metadata: Dict[str, Any] + + +@dataclass +class SciAgentTrace: + """Full trace for one SciAgent run.""" + question: str + created_at: str + config_snapshot: Dict[str, Any] + steps: List[SciAgentStep] + final_answer: str + + +def _get_sciagent_cfg(cfg: Optional[DictConfig]) -> Dict[str, Any]: + """ + Safely extract the SciAgent config block from the global Hydra config. + + Expected structure (in configs/config.yaml): + + flows: + sciagent: + enabled: true + params: + max_iterations: 8 + trace_reasoning: true + generate_datasets: true + require_verifiable_sources: true + """ + if cfg is None: + return {} + + flows_cfg = getattr(cfg, "flows", None) + sciagent_cfg = getattr(flows_cfg, "sciagent", None) + if sciagent_cfg is None: + return {} + + # Convert to plain dict to avoid leaking OmegaConf objects around + try: + from omegaconf import OmegaConf + return OmegaConf.to_container(sciagent_cfg, resolve=True) # type: ignore + except Exception: + # Fallback: best-effort conversion + return dict(sciagent_cfg) + + +def _maybe_write_dataset(trace: SciAgentTrace, sci_cfg: Dict[str, Any]) -> None: + """ + Optionally write a JSONL dataset row for this run. + + Controlled by flows.sciagent.params.generate_datasets. 
+ """ + params = sci_cfg.get("params", {}) if isinstance(sci_cfg, dict) else {} + generate = params.get("generate_datasets", False) + if not generate: + return + + out_dir = os.path.join("outputs", "datasets", "sciagent") + os.makedirs(out_dir, exist_ok=True) + + # Simple filename with timestamp + ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ") + filename = os.path.join(out_dir, f"sciagent_{ts}.jsonl") + + payload = asdict(trace) + with open(filename, "a", encoding="utf-8") as f: + f.write(json.dumps(payload) + "\n") + + +def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> str: + """ + Minimal SciAgent workflow. + + For now this is a lightweight, single-function orchestration that: + - Builds a structured reasoning trace with placeholder steps + - Optionally writes the trace as a dataset row + - Returns a human-readable answer string + + This is intentionally simple and safe; future work can: + - Call DeepSearch / RAG flows + - Integrate PRIME / Bioinformatics tools + - Use Pydantic Graph for multi-node execution + """ + sci_cfg = _get_sciagent_cfg(cfg) + + params = sci_cfg.get("params", {}) if isinstance(sci_cfg, dict) else {} + max_iterations = params.get("max_iterations", 4) + trace_reasoning = params.get("trace_reasoning", True) + + steps: List[SciAgentStep] = [] + + # --- Step 1: Parse --- + steps.append( + SciAgentStep( + name="parse_question", + description="Parsed the research question and identified key entities and intent.", + metadata={ + "question": question, + "detected_intent": "general_scientific_reasoning", + "entities": [], # placeholder for future NER / ontology mapping + }, + ) + ) + + # --- Step 2: Generate hypotheses (placeholder) --- + hypotheses = [ + "H1: The observed phenomenon is regulated by multiple interacting pathways.", + "H2: There exists a dominant mechanism with context-dependent modifiers.", + ] + steps.append( + SciAgentStep( + name="generate_hypotheses", + description="Generated a small set of 
candidate hypotheses.", + metadata={ + "hypotheses": hypotheses, + "max_iterations": max_iterations, + }, + ) + ) + + # --- Step 3: Literature / evidence planning (placeholder) --- + steps.append( + SciAgentStep( + name="plan_evidence_collection", + description="Outlined a plan to gather evidence via DeepSearch / RAG flows.", + metadata={ + "planned_sources": [ + "PubMed", + "preprint servers", + "review articles", + "databases (GO, PDB, etc.)", + ], + "use_deepsearch_flow": True, + "use_rag_flow": True, + }, + ) + ) + + # --- Step 4: Synthesis (placeholder) --- + synthesized_answer = ( + "SciAgent (minimal stub) analyzed your question and constructed a " + "generic scientific reasoning template. The current implementation " + "does not yet execute full DeepSearch / PRIME / Bioinformatics flows, " + "but it prepares structured hypotheses and an evidence-collection plan " + "that future versions will use for verifiable, tool-grounded reasoning." + ) + steps.append( + SciAgentStep( + name="synthesize_answer", + description="Synthesized a high-level answer based on the placeholder pipeline.", + metadata={ + "hypotheses_considered": len(hypotheses), + "trace_reasoning": trace_reasoning, + }, + ) + ) + + trace = SciAgentTrace( + question=question, + created_at=datetime.utcnow().isoformat() + "Z", + config_snapshot=sci_cfg, + steps=steps if trace_reasoning else [], + final_answer=synthesized_answer, + ) + + # Optional dataset logging + _maybe_write_dataset(trace, sci_cfg) + + return synthesized_answer + From 29bb18833ae7a44ee8466963f9af7dd1885078c7 Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Mon, 8 Dec 2025 14:41:31 -0500 Subject: [PATCH 5/7] Enrich SciAgent trace with structured hypotheses, evidence plan, and methods plan --- .../src/statemachines/sciagent_workflow.py | 121 +++++++++++++++--- 1 file changed, 101 insertions(+), 20 deletions(-) diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py 
b/DeepResearch/src/statemachines/sciagent_workflow.py index d8d2292a..d82742bb 100644 --- a/DeepResearch/src/statemachines/sciagent_workflow.py +++ b/DeepResearch/src/statemachines/sciagent_workflow.py @@ -17,6 +17,25 @@ class SciAgentStep: metadata: Dict[str, Any] +@dataclass +class Hypothesis: + """Structured representation of a single hypothesis.""" + id: str + statement: str + rationale: str + prior_confidence: float # 0.0–1.0 + + +@dataclass +class EvidencePlanItem: + """Planned evidence source / query.""" + id: str + source: str # e.g. "PubMed", "GO", "PDB" + description: str + query: str + priority: int # 1 = highest + + @dataclass class SciAgentTrace: """Full trace for one SciAgent run.""" @@ -24,6 +43,9 @@ class SciAgentTrace: created_at: str config_snapshot: Dict[str, Any] steps: List[SciAgentStep] + hypotheses: List[Hypothesis] + evidence_plan: List[EvidencePlanItem] + methods_plan: str final_answer: str @@ -88,6 +110,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st For now this is a lightweight, single-function orchestration that: - Builds a structured reasoning trace with placeholder steps + - Populates structured hypotheses, evidence plan, and methods plan - Optionally writes the trace as a dataset row - Returns a human-readable answer string @@ -117,48 +140,103 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st ) ) - # --- Step 2: Generate hypotheses (placeholder) --- - hypotheses = [ - "H1: The observed phenomenon is regulated by multiple interacting pathways.", - "H2: There exists a dominant mechanism with context-dependent modifiers.", + # --- Step 2: Generate structured hypotheses (placeholder content) --- + hypotheses: List[Hypothesis] = [ + Hypothesis( + id="H1", + statement="The observed phenomenon is regulated by multiple interacting pathways.", + rationale="Many biological and physical systems show emergent behavior arising from pathway interactions.", + 
prior_confidence=0.6, + ), + Hypothesis( + id="H2", + statement="A single dominant mechanism explains most of the observed variance, with context-dependent modifiers.", + rationale="Often one major factor explains the bulk of the effect size, while other modifiers fine-tune outcomes.", + prior_confidence=0.4, + ), ] + steps.append( SciAgentStep( name="generate_hypotheses", - description="Generated a small set of candidate hypotheses.", + description="Generated a small set of candidate hypotheses with rationales and prior confidence.", metadata={ - "hypotheses": hypotheses, + "hypotheses": [asdict(h) for h in hypotheses], "max_iterations": max_iterations, }, ) ) - # --- Step 3: Literature / evidence planning (placeholder) --- + # --- Step 3: Plan evidence collection (placeholder, but structured) --- + evidence_plan: List[EvidencePlanItem] = [ + EvidencePlanItem( + id="E1", + source="PubMed", + description="Retrieve primary research articles and recent reviews relevant to the question.", + query=f'{question} review[pt] OR mechanistic[tiab]', + priority=1, + ), + EvidencePlanItem( + id="E2", + source="Preprint servers", + description="Search preprints for cutting-edge or not-yet-reviewed studies.", + query=question, + priority=2, + ), + EvidencePlanItem( + id="E3", + source="Databases", + description="Query structured databases (GO, PDB, etc.) 
if the question involves genes/proteins.", + query="entity-specific queries to GO / PDB / related resources", + priority=3, + ), + ] + steps.append( SciAgentStep( name="plan_evidence_collection", - description="Outlined a plan to gather evidence via DeepSearch / RAG flows.", + description="Outlined a plan to gather evidence via DeepSearch / RAG and structured databases.", metadata={ - "planned_sources": [ - "PubMed", - "preprint servers", - "review articles", - "databases (GO, PDB, etc.)", - ], + "evidence_plan": [asdict(e) for e in evidence_plan], "use_deepsearch_flow": True, "use_rag_flow": True, }, ) ) - # --- Step 4: Synthesis (placeholder) --- + # --- Step 4: Methods-style plan (for SFT/DPO later) --- + methods_plan = ( + "1. Formulate the research question precisely and identify key entities.\n" + "2. Generate multiple mechanistic hypotheses with explicit rationales and prior confidence.\n" + "3. Design an evidence collection strategy using:\n" + " - PubMed for primary literature and reviews\n" + " - Preprint servers for recent, not-yet-reviewed work\n" + " - Structured databases (e.g., GO, PDB) when genes/proteins are involved\n" + "4. Retrieve and filter evidence based on study quality, recency, and relevance.\n" + "5. Map each piece of evidence to supporting or refuting specific hypotheses.\n" + "6. Reweight hypothesis confidence based on the aggregated evidence.\n" + "7. Synthesize a critical narrative that explains which hypothesis is best supported and why." + ) + + steps.append( + SciAgentStep( + name="construct_methods_plan", + description="Defined a methods-style plan for how SciAgent should collect and evaluate evidence.", + metadata={ + "methods_plan": methods_plan, + }, + ) + ) + + # --- Step 5: Synthesis (placeholder text for now) --- synthesized_answer = ( - "SciAgent (minimal stub) analyzed your question and constructed a " - "generic scientific reasoning template. 
The current implementation " - "does not yet execute full DeepSearch / PRIME / Bioinformatics flows, " - "but it prepares structured hypotheses and an evidence-collection plan " - "that future versions will use for verifiable, tool-grounded reasoning." + "SciAgent (minimal stub) analyzed your question and constructed a generic scientific " + "reasoning template. It generated structured hypotheses, an evidence collection plan, " + "and a methods-style evaluation procedure. Future versions will plug this plan into " + "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully " + "verifiable, data-backed conclusions." ) + steps.append( SciAgentStep( name="synthesize_answer", @@ -175,6 +253,9 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st created_at=datetime.utcnow().isoformat() + "Z", config_snapshot=sci_cfg, steps=steps if trace_reasoning else [], + hypotheses=hypotheses, + evidence_plan=evidence_plan, + methods_plan=methods_plan, final_answer=synthesized_answer, ) From 50e4f13c9052108ace972451069a361c1a9be8dc Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Mon, 8 Dec 2025 14:54:52 -0500 Subject: [PATCH 6/7] Integrate SciAgent workflow with DeepSearch state machine --- .../src/statemachines/sciagent_workflow.py | 66 +++++++++++++++---- configs/statemachines/flows/sciagent.yaml | 1 + 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py index d82742bb..dcb38324 100644 --- a/DeepResearch/src/statemachines/sciagent_workflow.py +++ b/DeepResearch/src/statemachines/sciagent_workflow.py @@ -63,6 +63,7 @@ def _get_sciagent_cfg(cfg: Optional[DictConfig]) -> Dict[str, Any]: trace_reasoning: true generate_datasets: true require_verifiable_sources: true + use_deepsearch: true """ if cfg is None: return {} @@ -106,16 +107,16 @@ def _maybe_write_dataset(trace: SciAgentTrace, sci_cfg: Dict[str, Any]) 
-> None: def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> str: """ - Minimal SciAgent workflow. + SciAgent workflow with DeepSearch integration. For now this is a lightweight, single-function orchestration that: - Builds a structured reasoning trace with placeholder steps - Populates structured hypotheses, evidence plan, and methods plan + - Optionally calls the DeepSearch workflow to gather external evidence - Optionally writes the trace as a dataset row - Returns a human-readable answer string This is intentionally simple and safe; future work can: - - Call DeepSearch / RAG flows - Integrate PRIME / Bioinformatics tools - Use Pydantic Graph for multi-node execution """ @@ -124,6 +125,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st params = sci_cfg.get("params", {}) if isinstance(sci_cfg, dict) else {} max_iterations = params.get("max_iterations", 4) trace_reasoning = params.get("trace_reasoning", True) + use_deepsearch = params.get("use_deepsearch", True) steps: List[SciAgentStep] = [] @@ -198,13 +200,40 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st description="Outlined a plan to gather evidence via DeepSearch / RAG and structured databases.", metadata={ "evidence_plan": [asdict(e) for e in evidence_plan], - "use_deepsearch_flow": True, - "use_rag_flow": True, + "use_deepsearch_flow": use_deepsearch, }, ) ) - # --- Step 4: Methods-style plan (for SFT/DPO later) --- + # --- Optional Step 4: Call DeepSearch workflow --- + deepsearch_answer: Optional[str] = None + if use_deepsearch: + try: + # Local import to avoid any potential circular imports at module load time + from .deepsearch_workflow import run_deepsearch_workflow + + deepsearch_answer = run_deepsearch_workflow(question, cfg) + steps.append( + SciAgentStep( + name="deepsearch_execution", + description="Executed DeepSearch workflow to collect and synthesize external evidence.", + metadata={ + 
"deepsearch_answer": deepsearch_answer, + }, + ) + ) + except Exception as e: + steps.append( + SciAgentStep( + name="deepsearch_error", + description="Attempted to run DeepSearch workflow but it failed.", + metadata={ + "error": str(e), + }, + ) + ) + + # --- Step 5: Methods-style plan (for SFT/DPO later) --- methods_plan = ( "1. Formulate the research question precisely and identify key entities.\n" "2. Generate multiple mechanistic hypotheses with explicit rationales and prior confidence.\n" @@ -228,14 +257,24 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st ) ) - # --- Step 5: Synthesis (placeholder text for now) --- - synthesized_answer = ( - "SciAgent (minimal stub) analyzed your question and constructed a generic scientific " - "reasoning template. It generated structured hypotheses, an evidence collection plan, " - "and a methods-style evaluation procedure. Future versions will plug this plan into " - "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully " - "verifiable, data-backed conclusions." - ) + # --- Step 6: Synthesis (DeepSearch-aware text for now) --- + if deepsearch_answer: + synthesized_answer = ( + "SciAgent used the DeepSearch workflow to gather external evidence and produced the " + "following synthesized summary:\n\n" + f"{deepsearch_answer}\n\n" + "In addition, SciAgent constructed structured hypotheses, an evidence collection plan, " + "and a methods-style evaluation procedure that can be reused for dataset generation " + "and future tool-grounded reasoning." + ) + else: + synthesized_answer = ( + "SciAgent (minimal stub) analyzed your question and constructed a generic scientific " + "reasoning template. It generated structured hypotheses, an evidence collection plan, " + "and a methods-style evaluation procedure. Future versions will plug this plan into " + "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully " + "verifiable, data-backed conclusions." 
+ ) steps.append( SciAgentStep( @@ -244,6 +283,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st metadata={ "hypotheses_considered": len(hypotheses), "trace_reasoning": trace_reasoning, + "used_deepsearch": bool(deepsearch_answer), }, ) ) diff --git a/configs/statemachines/flows/sciagent.yaml b/configs/statemachines/flows/sciagent.yaml index bac7f918..90c4dce8 100644 --- a/configs/statemachines/flows/sciagent.yaml +++ b/configs/statemachines/flows/sciagent.yaml @@ -11,6 +11,7 @@ params: trace_reasoning: true generate_datasets: true require_verifiable_sources: true + use_deepsearch: true stages: parse: From 2fe49267e3a5273e8f83438b8afb119aab010854 Mon Sep 17 00:00:00 2001 From: Syed Hussain Ather Date: Mon, 8 Dec 2025 15:02:37 -0500 Subject: [PATCH 7/7] Integrate SciAgent workflow with DeepSearch state machine with Bioinformatics/PRIME pipeline --- .../src/statemachines/sciagent_workflow.py | 111 +++++++++++++++--- configs/statemachines/flows/sciagent.yaml | 2 + 2 files changed, 99 insertions(+), 14 deletions(-) diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py index dcb38324..9304211e 100644 --- a/DeepResearch/src/statemachines/sciagent_workflow.py +++ b/DeepResearch/src/statemachines/sciagent_workflow.py @@ -64,6 +64,8 @@ def _get_sciagent_cfg(cfg: Optional[DictConfig]) -> Dict[str, Any]: generate_datasets: true require_verifiable_sources: true use_deepsearch: true + use_prime: false + use_bioinformatics: false """ if cfg is None: return {} @@ -107,18 +109,19 @@ def _maybe_write_dataset(trace: SciAgentTrace, sci_cfg: Dict[str, Any]) -> None: def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> str: """ - SciAgent workflow with DeepSearch integration. + SciAgent workflow with DeepSearch + PRIME + Bioinformatics integration. 
For now this is a lightweight, single-function orchestration that: - Builds a structured reasoning trace with placeholder steps - Populates structured hypotheses, evidence plan, and methods plan - - Optionally calls the DeepSearch workflow to gather external evidence + - Optionally calls DeepSearch, PRIME, and Bioinformatics workflows - Optionally writes the trace as a dataset row - Returns a human-readable answer string This is intentionally simple and safe; future work can: - - Integrate PRIME / Bioinformatics tools + - Add smarter domain routing (e.g. automatic protein question detection) - Use Pydantic Graph for multi-node execution + - Feed tool-grounded results back into hypothesis scoring """ sci_cfg = _get_sciagent_cfg(cfg) @@ -126,6 +129,8 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st max_iterations = params.get("max_iterations", 4) trace_reasoning = params.get("trace_reasoning", True) use_deepsearch = params.get("use_deepsearch", True) + use_prime = params.get("use_prime", False) + use_bioinformatics = params.get("use_bioinformatics", False) steps: List[SciAgentStep] = [] @@ -201,6 +206,8 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st metadata={ "evidence_plan": [asdict(e) for e in evidence_plan], "use_deepsearch_flow": use_deepsearch, + "use_prime_flow": use_prime, + "use_bioinformatics_flow": use_bioinformatics, }, ) ) @@ -209,7 +216,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st deepsearch_answer: Optional[str] = None if use_deepsearch: try: - # Local import to avoid any potential circular imports at module load time + # Local import to avoid circular imports at module load time from .deepsearch_workflow import run_deepsearch_workflow deepsearch_answer = run_deepsearch_workflow(question, cfg) @@ -233,7 +240,61 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st ) ) - # --- Step 5: Methods-style plan (for 
SFT/DPO later) --- + # --- Optional Step 5: Call PRIME workflow (protein engineering) --- + prime_answer: Optional[str] = None + if use_prime: + try: + from .prime_workflow import run_prime_workflow + + prime_answer = run_prime_workflow(question, cfg) + steps.append( + SciAgentStep( + name="prime_execution", + description="Executed PRIME flow for protein engineering / structural reasoning.", + metadata={ + "prime_answer": prime_answer, + }, + ) + ) + except Exception as e: + steps.append( + SciAgentStep( + name="prime_error", + description="Attempted to run PRIME workflow but it failed.", + metadata={ + "error": str(e), + }, + ) + ) + + # --- Optional Step 6: Call Bioinformatics workflow (data fusion) --- + bio_answer: Optional[str] = None + if use_bioinformatics: + try: + from .bioinformatics_workflow import run_bioinformatics_workflow + + bio_answer = run_bioinformatics_workflow(question, cfg) + steps.append( + SciAgentStep( + name="bioinformatics_execution", + description="Executed Bioinformatics flow for multi-source data fusion and integrative reasoning.", + metadata={ + "bioinformatics_answer": bio_answer, + }, + ) + ) + except Exception as e: + steps.append( + SciAgentStep( + name="bioinformatics_error", + description="Attempted to run Bioinformatics workflow but it failed.", + metadata={ + "error": str(e), + }, + ) + ) + + # --- Step 7: Methods-style plan (for SFT/DPO later) --- methods_plan = ( "1. Formulate the research question precisely and identify key entities.\n" "2. 
Generate multiple mechanistic hypotheses with explicit rationales and prior confidence.\n" @@ -257,33 +318,55 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st ) ) - # --- Step 6: Synthesis (DeepSearch-aware text for now) --- + # --- Step 8: Synthesis (aggregate DeepSearch/PRIME/Bio if present) --- + synthesis_chunks: List[str] = [] + if deepsearch_answer: + synthesis_chunks.append( + "DeepSearch evidence synthesis:\n" + f"{deepsearch_answer}" + ) + + if prime_answer: + synthesis_chunks.append( + "PRIME (protein engineering) synthesis:\n" + f"{prime_answer}" + ) + + if bio_answer: + synthesis_chunks.append( + "Bioinformatics (multi-source data fusion) synthesis:\n" + f"{bio_answer}" + ) + + if synthesis_chunks: synthesized_answer = ( - "SciAgent used the DeepSearch workflow to gather external evidence and produced the " - "following synthesized summary:\n\n" - f"{deepsearch_answer}\n\n" - "In addition, SciAgent constructed structured hypotheses, an evidence collection plan, " - "and a methods-style evaluation procedure that can be reused for dataset generation " - "and future tool-grounded reasoning." + "SciAgent orchestrated multiple domain flows (where enabled) and " + "combined their outputs into a unified scientific summary.\n\n" + + "\n\n".join(synthesis_chunks) + + "\n\nIn addition, SciAgent constructed structured hypotheses, an " + "evidence collection plan, and a methods-style evaluation procedure " + "that can be reused for dataset generation and future tool-grounded reasoning." ) else: synthesized_answer = ( "SciAgent (minimal stub) analyzed your question and constructed a generic scientific " "reasoning template. It generated structured hypotheses, an evidence collection plan, " "and a methods-style evaluation procedure. 
Future versions will plug this plan into " - "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully " + "tool-grounded flows (DeepSearch, PRIME, Bioinformatics) to produce fully " "verifiable, data-backed conclusions." ) steps.append( SciAgentStep( name="synthesize_answer", - description="Synthesized a high-level answer based on the placeholder pipeline.", + description="Synthesized a high-level answer based on the configured flows and placeholder pipeline.", metadata={ "hypotheses_considered": len(hypotheses), "trace_reasoning": trace_reasoning, "used_deepsearch": bool(deepsearch_answer), + "used_prime": bool(prime_answer), + "used_bioinformatics": bool(bio_answer), }, ) ) diff --git a/configs/statemachines/flows/sciagent.yaml b/configs/statemachines/flows/sciagent.yaml index 90c4dce8..da6ff40e 100644 --- a/configs/statemachines/flows/sciagent.yaml +++ b/configs/statemachines/flows/sciagent.yaml @@ -12,6 +12,8 @@ params: generate_datasets: true require_verifiable_sources: true use_deepsearch: true + use_prime: false + use_bioinformatics: false stages: parse: