From a4953e3b58f7353f5b96f6c4c9d3be123a9e4f6f Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Sun, 7 Dec 2025 18:32:39 -0500
Subject: [PATCH 1/7] Add SciAgent scientific research flow configuration

---
 configs/config.yaml                       |  4 +-
 configs/statemachines/flows/sciagent.yaml | 54 +++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 configs/statemachines/flows/sciagent.yaml

diff --git a/configs/config.yaml b/configs/config.yaml
index bac92858..6aae1d13 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -52,6 +52,8 @@ flows:
     enabled: false
   jina_ai:
     enabled: false
+  sciagent:
+    enabled: false
 
 # Output configuration
 outputs:
@@ -68,4 +70,4 @@ performance:
   enable_parallel_execution: true
   enable_result_caching: true
   cache_ttl: 3600  # 1 hour
-  enable_workflow_optimization: true
\ No newline at end of file
+  enable_workflow_optimization: true
diff --git a/configs/statemachines/flows/sciagent.yaml b/configs/statemachines/flows/sciagent.yaml
new file mode 100644
index 00000000..bac7f918
--- /dev/null
+++ b/configs/statemachines/flows/sciagent.yaml
@@ -0,0 +1,54 @@
+# @package _global_
+# SciAgent: generic scientific research flow
+# High-level, domain-agnostic research pipeline:
+# Parse → Hypothesize → Search → Analyze → Synthesize → Dataset logging
+
+enabled: true
+
+params:
+  max_iterations: 8
+  max_hypotheses: 5
+  trace_reasoning: true
+  generate_datasets: true
+  require_verifiable_sources: true
+
+stages:
+  parse:
+    enabled: true
+    classify_intent: true        # classify: mechanistic, comparative, predictive, etc.
+    extract_entities: true       # genes, proteins, diseases, interventions, etc.
+    detect_domains: true         # e.g. bio, chem, clinical, methods
+
+  hypothesize:
+    enabled: true
+    generate_candidates: true    # generate multiple candidate hypotheses
+    rank_by_plausibility: true
+    ensure_falsifiable: true     # enforce falsifiability / testability constraints
+
+  literature_review:
+    enabled: true
+    use_deepsearch_flow: true    # delegate to deepsearch flow when available
+    use_rag_flow: true           # combine with RAG retrieval
+    min_primary_sources: 5
+    min_review_articles: 1
+
+  analysis:
+    enabled: true
+    aggregate_evidence: true     # combine evidence across sources
+    detect_conflicts: true       # detect conflicting findings/claims
+    assess_evidence_quality: true
+
+  synthesis:
+    enabled: true
+    write_answer: true           # final answer to the question
+    write_critical_review: true  # more systematic, structured review
+    write_methods_section: true  # methods-style description of how evidence was gathered
+
+  dataset_logging:
+    enabled: true
+    save_hypothesis_traces: true
+    save_review_traces: true
+    save_methods_traces: true
+    save_tool_calls: true
+    save_state_snapshots: true
+

From a04d760608df352e807ff9cde6cf47115f8e5966 Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Mon, 8 Dec 2025 14:02:04 -0500
Subject: [PATCH 2/7] Add SciAgent graph nodes to app.py

---
 DeepResearch/app.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/DeepResearch/app.py b/DeepResearch/app.py
index 5337502b..34fcf06d 100644
--- a/DeepResearch/app.py
+++ b/DeepResearch/app.py
@@ -827,6 +827,40 @@ def _extract_summary(self, data_bag: Dict[str, Any], problem: StructuredProblem)
         
         return "\n".join(summary_parts) if summary_parts else "Analysis completed with available results."
 
+# --- SciAgent flow nodes ---
+@dataclass
+class SciAgentParse(BaseNode[ResearchState]):
+    """Entry node: run the whole SciAgent workflow and store its answer on the state."""
+
+    async def run(self, ctx: GraphRunContext[ResearchState]) -> 'SciAgentExecute':
+        # Imported lazily to avoid circular imports
+        from .src.statemachines.sciagent_workflow import run_sciagent_workflow
+
+        state = ctx.state
+        state.notes.append("Starting SciAgent workflow")
+
+        # A failure is recorded as a note plus an "Error: ..." answer instead of being raised,
+        # so the graph still advances to SciAgentExecute.
+        try:
+            state.answers.append(run_sciagent_workflow(state.question, state.config))
+            state.notes.append("SciAgent workflow completed successfully")
+        except Exception as exc:
+            failure = f"SciAgent workflow failed: {exc}"
+            state.notes.append(failure)
+            state.answers.append(f"Error: {failure}")
+
+        return SciAgentExecute()
+
+
+@dataclass
+class SciAgentExecute(BaseNode[ResearchState]):
+    """Terminal node: end the graph with the most recent answer stored on the state."""
+
+    async def run(self, ctx: GraphRunContext[ResearchState]) -> Annotated[End[str], Edge(label="done")]:
+        # SciAgentParse already ran the workflow; use a fixed message if no answer was stored.
+        answers = ctx.state.answers
+        return End(answers[-1] if answers else "SciAgent analysis completed.")
+
 
 # --- Bioinformatics flow nodes ---
 @dataclass

From 8e268a68cc06d89731947ed662fa8215495146a5 Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Mon, 8 Dec 2025 14:07:45 -0500
Subject: [PATCH 3/7] Route SciAgent statemachine flow in app.py

---
 DeepResearch/app.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/DeepResearch/app.py b/DeepResearch/app.py
index 34fcf06d..1f8db194 100644
--- a/DeepResearch/app.py
+++ b/DeepResearch/app.py
@@ -101,6 +101,12 @@ async def run(self, ctx: GraphRunContext[ResearchState]) -> Union[Search, Primar
         
+        # Route to SciAgent flow if enabled
+        sciagent_cfg = getattr(getattr(cfg, "flows", {}), "sciagent", None)
+        if getattr(sciagent_cfg or {}, "enabled", False):
+            ctx.state.notes.append("SciAgent flow enabled")
+            return SciAgentParse()
+
         # Route to RAG flow if enabled
         rag_cfg = getattr(getattr(cfg, "flows", {}), "rag", None)
         if getattr(rag_cfg or {}, "enabled", False):
             ctx.state.notes.append("RAG flow enabled")
             return RAGParse()
@@ -932,9 +938,17 @@ async def run(self, ctx: GraphRunContext[ResearchState]) -> Annotated[End[str],
 def run_graph(question: str, cfg: DictConfig) -> str:
     state = ResearchState(question=question, config=cfg)
     # Include all nodes in runtime graph - instantiate them
-    nodes = (Plan(), Search(), Analyze(), Synthesize(), PrepareChallenge(), RunChallenge(), EvaluateChallenge(),
-             DSPlan(), DSExecute(), DSAnalyze(), DSSynthesize(), PrimeParse(), PrimePlan(), PrimeExecute(), PrimeEvaluate(),
-             BioinformaticsParse(), BioinformaticsFuse(), RAGParse(), RAGExecute(), PrimaryREACTWorkflow(), EnhancedREACTWorkflow())
+    nodes = (
+        Plan(), Search(), Analyze(), Synthesize(),
+        PrepareChallenge(), RunChallenge(), EvaluateChallenge(),
+        DSPlan(), DSExecute(), DSAnalyze(), DSSynthesize(),
+        PrimeParse(), PrimePlan(), PrimeExecute(), PrimeEvaluate(),
+        BioinformaticsParse(), BioinformaticsFuse(),
+        SciAgentParse(), SciAgentExecute(),
+        RAGParse(), RAGExecute(),
+        PrimaryREACTWorkflow(), EnhancedREACTWorkflow()
+    )
+
     g = Graph(nodes=nodes, state_type=ResearchState)
     result = asyncio.run(g.run(Plan(), state=state))
     return result.output

From 150dd8544d5576a55caca70e76cf7632a1fbbb6e Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Mon, 8 Dec 2025 14:13:35 -0500
Subject: [PATCH 4/7] Add minimal SciAgent workflow stub with dataset logging

---
 .../src/statemachines/sciagent_workflow.py    | 185 ++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 DeepResearch/src/statemachines/sciagent_workflow.py

diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py
new file mode 100644
index 00000000..d8d2292a
--- /dev/null
+++ b/DeepResearch/src/statemachines/sciagent_workflow.py
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from omegaconf import DictConfig
+
+
+@dataclass
+class SciAgentStep:
+    """Single step in the SciAgent reasoning trace."""
+    name: str
+    description: str
+    metadata: Dict[str, Any]
+
+
+@dataclass
+class SciAgentTrace:
+    """Full trace for one SciAgent run."""
+    question: str
+    created_at: str
+    config_snapshot: Dict[str, Any]
+    steps: List[SciAgentStep]
+    final_answer: str
+
+
+def _get_sciagent_cfg(cfg: Optional[DictConfig]) -> Dict[str, Any]:
+    """
+    Return cfg.flows.sciagent as a plain dict ({} if cfg or that block is missing).
+
+    NOTE(review): configs/config.yaml only sets `enabled` under this key; confirm
+    the flow file's `params` are merged here. Expected composed structure:
+    flows:
+      sciagent:
+        enabled: true
+        params:
+          max_iterations: 8
+          trace_reasoning: true
+          generate_datasets: true
+          require_verifiable_sources: true
+    """
+    if cfg is None:
+        return {}
+
+    flows_cfg = getattr(cfg, "flows", None)
+    sciagent_cfg = getattr(flows_cfg, "sciagent", None)
+    if sciagent_cfg is None:
+        return {}
+
+    # Convert to a plain container so OmegaConf objects do not leak to callers
+    try:
+        from omegaconf import OmegaConf
+        return OmegaConf.to_container(sciagent_cfg, resolve=True)  # type: ignore
+    except Exception:
+        # to_container failed: fall back to a best-effort dict() conversion
+        return dict(sciagent_cfg)
+
+
+def _maybe_write_dataset(trace: SciAgentTrace, sci_cfg: Dict[str, Any]) -> None:
+    """
+    Append this run's trace as one JSON line under outputs/datasets/sciagent/.
+
+    Does nothing unless sci_cfg["params"]["generate_datasets"] is truthy. The
+    output path is relative to the current working directory.
+    """
+    # `params` may be absent, or present but empty (None) in YAML; treat both as {}.
+    params = (sci_cfg.get("params") if isinstance(sci_cfg, dict) else None) or {}
+    if not isinstance(params, dict) or not params.get("generate_datasets", False):
+        return
+
+    out_dir = os.path.join("outputs", "datasets", "sciagent")
+    os.makedirs(out_dir, exist_ok=True)
+
+    # One file per UTC second; runs within the same second append to the same file.
+    ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
+    filename = os.path.join(out_dir, f"sciagent_{ts}.jsonl")
+
+    with open(filename, "a", encoding="utf-8") as f:
+        f.write(json.dumps(asdict(trace)) + "\n")
+
+
+def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> str:
+    """
+    Minimal SciAgent workflow.
+
+    For now this is a lightweight, single-function orchestration that:
+      - Builds a structured reasoning trace with placeholder steps
+      - Optionally writes the trace as a dataset row
+      - Returns a human-readable answer string
+
+    This is intentionally simple and safe; future work can:
+      - Call DeepSearch / RAG flows
+      - Integrate PRIME / Bioinformatics tools
+      - Use Pydantic Graph for multi-node execution
+    """
+    sci_cfg = _get_sciagent_cfg(cfg)
+
+    params = sci_cfg.get("params", {}) if isinstance(sci_cfg, dict) else {}
+    max_iterations = params.get("max_iterations", 4)
+    trace_reasoning = params.get("trace_reasoning", True)
+
+    steps: List[SciAgentStep] = []
+
+    # --- Step 1: Parse ---
+    steps.append(
+        SciAgentStep(
+            name="parse_question",
+            description="Parsed the research question and identified key entities and intent.",
+            metadata={
+                "question": question,
+                "detected_intent": "general_scientific_reasoning",
+                "entities": [],  # placeholder for future NER / ontology mapping
+            },
+        )
+    )
+
+    # --- Step 2: Generate hypotheses (placeholder) ---
+    hypotheses = [
+        "H1: The observed phenomenon is regulated by multiple interacting pathways.",
+        "H2: There exists a dominant mechanism with context-dependent modifiers.",
+    ]
+    steps.append(
+        SciAgentStep(
+            name="generate_hypotheses",
+            description="Generated a small set of candidate hypotheses.",
+            metadata={
+                "hypotheses": hypotheses,
+                "max_iterations": max_iterations,
+            },
+        )
+    )
+
+    # --- Step 3: Literature / evidence planning (placeholder) ---
+    steps.append(
+        SciAgentStep(
+            name="plan_evidence_collection",
+            description="Outlined a plan to gather evidence via DeepSearch / RAG flows.",
+            metadata={
+                "planned_sources": [
+                    "PubMed",
+                    "preprint servers",
+                    "review articles",
+                    "databases (GO, PDB, etc.)",
+                ],
+                "use_deepsearch_flow": True,
+                "use_rag_flow": True,
+            },
+        )
+    )
+
+    # --- Step 4: Synthesis (placeholder) ---
+    synthesized_answer = (
+        "SciAgent (minimal stub) analyzed your question and constructed a "
+        "generic scientific reasoning template. The current implementation "
+        "does not yet execute full DeepSearch / PRIME / Bioinformatics flows, "
+        "but it prepares structured hypotheses and an evidence-collection plan "
+        "that future versions will use for verifiable, tool-grounded reasoning."
+    )
+    steps.append(
+        SciAgentStep(
+            name="synthesize_answer",
+            description="Synthesized a high-level answer based on the placeholder pipeline.",
+            metadata={
+                "hypotheses_considered": len(hypotheses),
+                "trace_reasoning": trace_reasoning,
+            },
+        )
+    )
+
+    trace = SciAgentTrace(
+        question=question,
+        created_at=datetime.utcnow().isoformat() + "Z",
+        config_snapshot=sci_cfg,
+        steps=steps if trace_reasoning else [],
+        final_answer=synthesized_answer,
+    )
+
+    # Optional dataset logging
+    _maybe_write_dataset(trace, sci_cfg)
+
+    return synthesized_answer
+

From 29bb18833ae7a44ee8466963f9af7dd1885078c7 Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Mon, 8 Dec 2025 14:41:31 -0500
Subject: [PATCH 5/7] Enrich SciAgent trace with structured hypotheses,
 evidence plan, and methods plan

---
 .../src/statemachines/sciagent_workflow.py    | 121 +++++++++++++++---
 1 file changed, 101 insertions(+), 20 deletions(-)

diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py
index d8d2292a..d82742bb 100644
--- a/DeepResearch/src/statemachines/sciagent_workflow.py
+++ b/DeepResearch/src/statemachines/sciagent_workflow.py
@@ -17,6 +17,25 @@ class SciAgentStep:
     metadata: Dict[str, Any]
 
 
+@dataclass
+class Hypothesis:
+    """Structured representation of a single hypothesis."""
+    id: str
+    statement: str
+    rationale: str
+    prior_confidence: float  # 0.0–1.0
+
+
+@dataclass
+class EvidencePlanItem:
+    """Planned evidence source / query."""
+    id: str
+    source: str          # e.g. "PubMed", "GO", "PDB"
+    description: str
+    query: str
+    priority: int        # 1 = highest
+
+
 @dataclass
 class SciAgentTrace:
     """Full trace for one SciAgent run."""
@@ -24,6 +43,9 @@ class SciAgentTrace:
     created_at: str
     config_snapshot: Dict[str, Any]
     steps: List[SciAgentStep]
+    hypotheses: List[Hypothesis]
+    evidence_plan: List[EvidencePlanItem]
+    methods_plan: str
     final_answer: str
 
 
@@ -88,6 +110,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
 
     For now this is a lightweight, single-function orchestration that:
       - Builds a structured reasoning trace with placeholder steps
+      - Populates structured hypotheses, evidence plan, and methods plan
       - Optionally writes the trace as a dataset row
       - Returns a human-readable answer string
 
@@ -117,48 +140,103 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
         )
     )
 
-    # --- Step 2: Generate hypotheses (placeholder) ---
-    hypotheses = [
-        "H1: The observed phenomenon is regulated by multiple interacting pathways.",
-        "H2: There exists a dominant mechanism with context-dependent modifiers.",
+    # --- Step 2: Generate structured hypotheses (placeholder content) ---
+    hypotheses: List[Hypothesis] = [
+        Hypothesis(
+            id="H1",
+            statement="The observed phenomenon is regulated by multiple interacting pathways.",
+            rationale="Many biological and physical systems show emergent behavior arising from pathway interactions.",
+            prior_confidence=0.6,
+        ),
+        Hypothesis(
+            id="H2",
+            statement="A single dominant mechanism explains most of the observed variance, with context-dependent modifiers.",
+            rationale="Often one major factor explains the bulk of the effect size, while other modifiers fine-tune outcomes.",
+            prior_confidence=0.4,
+        ),
     ]
+
     steps.append(
         SciAgentStep(
             name="generate_hypotheses",
-            description="Generated a small set of candidate hypotheses.",
+            description="Generated a small set of candidate hypotheses with rationales and prior confidence.",
             metadata={
-                "hypotheses": hypotheses,
+                "hypotheses": [asdict(h) for h in hypotheses],
                 "max_iterations": max_iterations,
             },
         )
     )
 
-    # --- Step 3: Literature / evidence planning (placeholder) ---
+    # --- Step 3: Plan evidence collection (placeholder, but structured) ---
+    evidence_plan: List[EvidencePlanItem] = [
+        EvidencePlanItem(
+            id="E1",
+            source="PubMed",
+            description="Retrieve primary research articles and recent reviews relevant to the question.",
+            query=f'({question}) AND (review[pt] OR mechanistic[tiab])',
+            priority=1,
+        ),
+        EvidencePlanItem(
+            id="E2",
+            source="Preprint servers",
+            description="Search preprints for cutting-edge or not-yet-reviewed studies.",
+            query=question,
+            priority=2,
+        ),
+        EvidencePlanItem(
+            id="E3",
+            source="Databases",
+            description="Query structured databases (GO, PDB, etc.) if the question involves genes/proteins.",
+            query="entity-specific queries to GO / PDB / related resources",
+            priority=3,
+        ),
+    ]
+
     steps.append(
         SciAgentStep(
             name="plan_evidence_collection",
-            description="Outlined a plan to gather evidence via DeepSearch / RAG flows.",
+            description="Outlined a plan to gather evidence via DeepSearch / RAG and structured databases.",
             metadata={
-                "planned_sources": [
-                    "PubMed",
-                    "preprint servers",
-                    "review articles",
-                    "databases (GO, PDB, etc.)",
-                ],
+                "evidence_plan": [asdict(e) for e in evidence_plan],
                 "use_deepsearch_flow": True,
                 "use_rag_flow": True,
             },
         )
     )
 
-    # --- Step 4: Synthesis (placeholder) ---
+    # --- Step 4: Methods-style plan (for SFT/DPO later) ---
+    methods_plan = (
+        "1. Formulate the research question precisely and identify key entities.\n"
+        "2. Generate multiple mechanistic hypotheses with explicit rationales and prior confidence.\n"
+        "3. Design an evidence collection strategy using:\n"
+        "   - PubMed for primary literature and reviews\n"
+        "   - Preprint servers for recent, not-yet-reviewed work\n"
+        "   - Structured databases (e.g., GO, PDB) when genes/proteins are involved\n"
+        "4. Retrieve and filter evidence based on study quality, recency, and relevance.\n"
+        "5. Map each piece of evidence to supporting or refuting specific hypotheses.\n"
+        "6. Reweight hypothesis confidence based on the aggregated evidence.\n"
+        "7. Synthesize a critical narrative that explains which hypothesis is best supported and why."
+    )
+
+    steps.append(
+        SciAgentStep(
+            name="construct_methods_plan",
+            description="Defined a methods-style plan for how SciAgent should collect and evaluate evidence.",
+            metadata={
+                "methods_plan": methods_plan,
+            },
+        )
+    )
+
+    # --- Step 5: Synthesis (placeholder text for now) ---
     synthesized_answer = (
-        "SciAgent (minimal stub) analyzed your question and constructed a "
-        "generic scientific reasoning template. The current implementation "
-        "does not yet execute full DeepSearch / PRIME / Bioinformatics flows, "
-        "but it prepares structured hypotheses and an evidence-collection plan "
-        "that future versions will use for verifiable, tool-grounded reasoning."
+        "SciAgent (minimal stub) analyzed your question and constructed a generic scientific "
+        "reasoning template. It generated structured hypotheses, an evidence collection plan, "
+        "and a methods-style evaluation procedure. Future versions will plug this plan into "
+        "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully "
+        "verifiable, data-backed conclusions."
     )
+
     steps.append(
         SciAgentStep(
             name="synthesize_answer",
@@ -175,6 +253,9 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
         created_at=datetime.utcnow().isoformat() + "Z",
         config_snapshot=sci_cfg,
         steps=steps if trace_reasoning else [],
+        hypotheses=hypotheses,
+        evidence_plan=evidence_plan,
+        methods_plan=methods_plan,
         final_answer=synthesized_answer,
     )
 

From 50e4f13c9052108ace972451069a361c1a9be8dc Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Mon, 8 Dec 2025 14:54:52 -0500
Subject: [PATCH 6/7] Integrate SciAgent workflow with DeepSearch state machine

---
 .../src/statemachines/sciagent_workflow.py    | 66 +++++++++++++++----
 configs/statemachines/flows/sciagent.yaml     |  1 +
 2 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py
index d82742bb..dcb38324 100644
--- a/DeepResearch/src/statemachines/sciagent_workflow.py
+++ b/DeepResearch/src/statemachines/sciagent_workflow.py
@@ -63,6 +63,7 @@ def _get_sciagent_cfg(cfg: Optional[DictConfig]) -> Dict[str, Any]:
           trace_reasoning: true
           generate_datasets: true
           require_verifiable_sources: true
+          use_deepsearch: true
     """
     if cfg is None:
         return {}
@@ -106,16 +107,16 @@ def _maybe_write_dataset(trace: SciAgentTrace, sci_cfg: Dict[str, Any]) -> None:
 
 def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> str:
     """
-    Minimal SciAgent workflow.
+    SciAgent workflow with DeepSearch integration.
 
     For now this is a lightweight, single-function orchestration that:
       - Builds a structured reasoning trace with placeholder steps
       - Populates structured hypotheses, evidence plan, and methods plan
+      - Optionally calls the DeepSearch workflow to gather external evidence
       - Optionally writes the trace as a dataset row
       - Returns a human-readable answer string
 
     This is intentionally simple and safe; future work can:
-      - Call DeepSearch / RAG flows
       - Integrate PRIME / Bioinformatics tools
       - Use Pydantic Graph for multi-node execution
     """
@@ -124,6 +125,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
     params = sci_cfg.get("params", {}) if isinstance(sci_cfg, dict) else {}
     max_iterations = params.get("max_iterations", 4)
     trace_reasoning = params.get("trace_reasoning", True)
+    use_deepsearch = params.get("use_deepsearch", True)
 
     steps: List[SciAgentStep] = []
 
@@ -198,13 +200,40 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
             description="Outlined a plan to gather evidence via DeepSearch / RAG and structured databases.",
             metadata={
                 "evidence_plan": [asdict(e) for e in evidence_plan],
-                "use_deepsearch_flow": True,
-                "use_rag_flow": True,
+                "use_deepsearch_flow": use_deepsearch,
             },
         )
     )
 
-    # --- Step 4: Methods-style plan (for SFT/DPO later) ---
+    # --- Optional Step 4: Call DeepSearch workflow ---
+    deepsearch_answer: Optional[str] = None
+    if use_deepsearch:
+        try:
+            # Local import to avoid any potential circular imports at module load time
+            from .deepsearch_workflow import run_deepsearch_workflow
+
+            deepsearch_answer = run_deepsearch_workflow(question, cfg)
+            steps.append(
+                SciAgentStep(
+                    name="deepsearch_execution",
+                    description="Executed DeepSearch workflow to collect and synthesize external evidence.",
+                    metadata={
+                        "deepsearch_answer": deepsearch_answer,
+                    },
+                )
+            )
+        except Exception as e:
+            steps.append(
+                SciAgentStep(
+                    name="deepsearch_error",
+                    description="Attempted to run DeepSearch workflow but it failed.",
+                    metadata={
+                        "error": str(e),
+                    },
+                )
+            )
+
+    # --- Step 5: Methods-style plan (for SFT/DPO later) ---
     methods_plan = (
         "1. Formulate the research question precisely and identify key entities.\n"
         "2. Generate multiple mechanistic hypotheses with explicit rationales and prior confidence.\n"
@@ -228,14 +257,24 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
         )
     )
 
-    # --- Step 5: Synthesis (placeholder text for now) ---
-    synthesized_answer = (
-        "SciAgent (minimal stub) analyzed your question and constructed a generic scientific "
-        "reasoning template. It generated structured hypotheses, an evidence collection plan, "
-        "and a methods-style evaluation procedure. Future versions will plug this plan into "
-        "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully "
-        "verifiable, data-backed conclusions."
-    )
+    # --- Step 6: Synthesis (DeepSearch-aware text for now) ---
+    if deepsearch_answer:
+        synthesized_answer = (
+            "SciAgent used the DeepSearch workflow to gather external evidence and produced the "
+            "following synthesized summary:\n\n"
+            f"{deepsearch_answer}\n\n"
+            "In addition, SciAgent constructed structured hypotheses, an evidence collection plan, "
+            "and a methods-style evaluation procedure that can be reused for dataset generation "
+            "and future tool-grounded reasoning."
+        )
+    else:
+        synthesized_answer = (
+            "SciAgent (minimal stub) analyzed your question and constructed a generic scientific "
+            "reasoning template. It generated structured hypotheses, an evidence collection plan, "
+            "and a methods-style evaluation procedure. Future versions will plug this plan into "
+            "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully "
+            "verifiable, data-backed conclusions."
+        )
 
     steps.append(
         SciAgentStep(
@@ -244,6 +283,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
             metadata={
                 "hypotheses_considered": len(hypotheses),
                 "trace_reasoning": trace_reasoning,
+                "used_deepsearch": bool(deepsearch_answer),
             },
         )
     )
diff --git a/configs/statemachines/flows/sciagent.yaml b/configs/statemachines/flows/sciagent.yaml
index bac7f918..90c4dce8 100644
--- a/configs/statemachines/flows/sciagent.yaml
+++ b/configs/statemachines/flows/sciagent.yaml
@@ -11,6 +11,7 @@ params:
   trace_reasoning: true
   generate_datasets: true
   require_verifiable_sources: true
+  use_deepsearch: true
 
 stages:
   parse:

From 2fe49267e3a5273e8f83438b8afb119aab010854 Mon Sep 17 00:00:00 2001
From: Syed Hussain Ather <shussainather@gmail.com>
Date: Mon, 8 Dec 2025 15:02:37 -0500
Subject: [PATCH 7/7] Integrate SciAgent workflow with PRIME and
 Bioinformatics workflows

---
 .../src/statemachines/sciagent_workflow.py    | 111 +++++++++++++++---
 configs/statemachines/flows/sciagent.yaml     |   2 +
 2 files changed, 99 insertions(+), 14 deletions(-)

diff --git a/DeepResearch/src/statemachines/sciagent_workflow.py b/DeepResearch/src/statemachines/sciagent_workflow.py
index dcb38324..9304211e 100644
--- a/DeepResearch/src/statemachines/sciagent_workflow.py
+++ b/DeepResearch/src/statemachines/sciagent_workflow.py
@@ -64,6 +64,8 @@ def _get_sciagent_cfg(cfg: Optional[DictConfig]) -> Dict[str, Any]:
           generate_datasets: true
           require_verifiable_sources: true
           use_deepsearch: true
+          use_prime: false
+          use_bioinformatics: false
     """
     if cfg is None:
         return {}
@@ -107,18 +109,19 @@ def _maybe_write_dataset(trace: SciAgentTrace, sci_cfg: Dict[str, Any]) -> None:
 
 def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> str:
     """
-    SciAgent workflow with DeepSearch integration.
+    SciAgent workflow with DeepSearch + PRIME + Bioinformatics integration.
 
     For now this is a lightweight, single-function orchestration that:
       - Builds a structured reasoning trace with placeholder steps
       - Populates structured hypotheses, evidence plan, and methods plan
-      - Optionally calls the DeepSearch workflow to gather external evidence
+      - Optionally calls DeepSearch, PRIME, and Bioinformatics workflows
       - Optionally writes the trace as a dataset row
       - Returns a human-readable answer string
 
     This is intentionally simple and safe; future work can:
-      - Integrate PRIME / Bioinformatics tools
+      - Add smarter domain routing (e.g. automatic protein question detection)
       - Use Pydantic Graph for multi-node execution
+      - Feed tool-grounded results back into hypothesis scoring
     """
     sci_cfg = _get_sciagent_cfg(cfg)
 
@@ -126,6 +129,8 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
     max_iterations = params.get("max_iterations", 4)
     trace_reasoning = params.get("trace_reasoning", True)
     use_deepsearch = params.get("use_deepsearch", True)
+    use_prime = params.get("use_prime", False)
+    use_bioinformatics = params.get("use_bioinformatics", False)
 
     steps: List[SciAgentStep] = []
 
@@ -201,6 +206,8 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
             metadata={
                 "evidence_plan": [asdict(e) for e in evidence_plan],
                 "use_deepsearch_flow": use_deepsearch,
+                "use_prime_flow": use_prime,
+                "use_bioinformatics_flow": use_bioinformatics,
             },
         )
     )
@@ -209,7 +216,7 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
     deepsearch_answer: Optional[str] = None
     if use_deepsearch:
         try:
-            # Local import to avoid any potential circular imports at module load time
+            # Local import to avoid circular imports at module load time
             from .deepsearch_workflow import run_deepsearch_workflow
 
             deepsearch_answer = run_deepsearch_workflow(question, cfg)
@@ -233,7 +240,61 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
                 )
             )
 
-    # --- Step 5: Methods-style plan (for SFT/DPO later) ---
+    # --- Optional Step 5: Call PRIME workflow (protein engineering) ---
+    prime_answer: Optional[str] = None
+    if use_prime:
+        try:
+            from .prime_workflow import run_prime_workflow
+
+            prime_answer = run_prime_workflow(question, cfg)
+            steps.append(
+                SciAgentStep(
+                    name="prime_execution",
+                    description="Executed PRIME flow for protein engineering / structural reasoning.",
+                    metadata={
+                        "prime_answer": prime_answer,
+                    },
+                )
+            )
+        except Exception as e:
+            steps.append(
+                SciAgentStep(
+                    name="prime_error",
+                    description="Attempted to run PRIME workflow but it failed.",
+                    metadata={
+                        "error": str(e),
+                    },
+                )
+            )
+
+    # --- Optional Step 6: Call Bioinformatics workflow (data fusion) ---
+    bio_answer: Optional[str] = None
+    if use_bioinformatics:
+        try:
+            from .bioinformatics_workflow import run_bioinformatics_workflow
+
+            bio_answer = run_bioinformatics_workflow(question, cfg)
+            steps.append(
+                SciAgentStep(
+                    name="bioinformatics_execution",
+                    description="Executed Bioinformatics flow for multi-source data fusion and integrative reasoning.",
+                    metadata={
+                        "bioinformatics_answer": bio_answer,
+                    },
+                )
+            )
+        except Exception as e:
+            steps.append(
+                SciAgentStep(
+                    name="bioinformatics_error",
+                    description="Attempted to run Bioinformatics workflow but it failed.",
+                    metadata={
+                        "error": str(e),
+                    },
+                )
+            )
+
+    # --- Step 7: Methods-style plan (for SFT/DPO later) ---
     methods_plan = (
         "1. Formulate the research question precisely and identify key entities.\n"
         "2. Generate multiple mechanistic hypotheses with explicit rationales and prior confidence.\n"
@@ -257,33 +318,55 @@ def run_sciagent_workflow(question: str, cfg: Optional[DictConfig] = None) -> st
         )
     )
 
-    # --- Step 6: Synthesis (DeepSearch-aware text for now) ---
+    # --- Step 8: Synthesis (aggregate DeepSearch/PRIME/Bioinformatics answers if present) ---
+    synthesis_chunks: List[str] = []
+
     if deepsearch_answer:
+        synthesis_chunks.append(
+            "DeepSearch evidence synthesis:\n"
+            f"{deepsearch_answer}"
+        )
+
+    if prime_answer:
+        synthesis_chunks.append(
+            "PRIME (protein engineering) synthesis:\n"
+            f"{prime_answer}"
+        )
+
+    if bio_answer:
+        synthesis_chunks.append(
+            "Bioinformatics (multi-source data fusion) synthesis:\n"
+            f"{bio_answer}"
+        )
+
+    if synthesis_chunks:
         synthesized_answer = (
-            "SciAgent used the DeepSearch workflow to gather external evidence and produced the "
-            "following synthesized summary:\n\n"
-            f"{deepsearch_answer}\n\n"
-            "In addition, SciAgent constructed structured hypotheses, an evidence collection plan, "
-            "and a methods-style evaluation procedure that can be reused for dataset generation "
-            "and future tool-grounded reasoning."
+            "SciAgent orchestrated multiple domain flows (where enabled) and "
+            "combined their outputs into a unified scientific summary.\n\n"
+            + "\n\n".join(synthesis_chunks)
+            + "\n\nIn addition, SciAgent constructed structured hypotheses, an "
+              "evidence collection plan, and a methods-style evaluation procedure "
+              "that can be reused for dataset generation and future tool-grounded reasoning."
         )
     else:
         synthesized_answer = (
             "SciAgent (minimal stub) analyzed your question and constructed a generic scientific "
             "reasoning template. It generated structured hypotheses, an evidence collection plan, "
             "and a methods-style evaluation procedure. Future versions will plug this plan into "
-            "tool-grounded flows (DeepSearch, RAG, PRIME, Bioinformatics) to produce fully "
+            "tool-grounded flows (DeepSearch, PRIME, Bioinformatics) to produce fully "
             "verifiable, data-backed conclusions."
         )
 
     steps.append(
         SciAgentStep(
             name="synthesize_answer",
-            description="Synthesized a high-level answer based on the placeholder pipeline.",
+            description="Synthesized a high-level answer based on the configured flows and placeholder pipeline.",
             metadata={
                 "hypotheses_considered": len(hypotheses),
                 "trace_reasoning": trace_reasoning,
                 "used_deepsearch": bool(deepsearch_answer),
+                "used_prime": bool(prime_answer),
+                "used_bioinformatics": bool(bio_answer),
             },
         )
     )
diff --git a/configs/statemachines/flows/sciagent.yaml b/configs/statemachines/flows/sciagent.yaml
index 90c4dce8..da6ff40e 100644
--- a/configs/statemachines/flows/sciagent.yaml
+++ b/configs/statemachines/flows/sciagent.yaml
@@ -12,6 +12,8 @@ params:
   generate_datasets: true
   require_verifiable_sources: true
   use_deepsearch: true
+  use_prime: false
+  use_bioinformatics: false
 
 stages:
   parse: