msitarzewski · msitarzewski · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/README.md b/README.md
diff --git a/src/duh/api/routes/ask.py b/src/duh/api/routes/ask.py
@@ -104,7 +104,15 @@ async def _handle_consensus(  # type: ignore[no-untyped-def]
     from duh.cli.app import _run_consensus
 
     use_native_search = config.tools.enabled and config.tools.web_search.native
-    decision, confidence, rigor, dissent, cost, _overview = await _run_consensus(
+    (
+        decision,
+        confidence,
+        rigor,
+        dissent,
+        cost,
+        _overview,
+        _citations,
+    ) = await _run_consensus(
         body.question,
         config,
         pm,
@@ -176,9 +184,15 @@ async def _handle_decompose(body: AskRequest, config, pm) -> AskResponse:  # typ
     if len(subtask_specs) == 1:
         from duh.cli.app import _run_consensus
 
-        decision, confidence, rigor, dissent, cost, _overview = await _run_consensus(
-            body.question, config, pm
-        )
+        (
+            decision,
+            confidence,
+            rigor,
+            dissent,
+            cost,
+            _overview,
+            _citations,
+        ) = await _run_consensus(body.question, config, pm)
         return AskResponse(
             decision=decision,
             confidence=confidence,

diff --git a/src/duh/cli/app.py b/src/duh/cli/app.py
@@ -210,10 +210,12 @@ async def _run_consensus(
     proposer_override: str | None = None,
     challengers_override: list[str] | None = None,
     web_search: bool = False,
-) -> tuple[str, float, float, str | None, float, str | None]:
+) -> tuple[
+    str, float, float, str | None, float, str | None, list[dict[str, str | None]]
+]:
     """Run the full consensus loop.
 
-    Returns (decision, confidence, rigor, dissent, total_cost, overview).
+    Returns (decision, confidence, rigor, dissent, total_cost, overview, citations).
     """
     from duh.consensus.convergence import check_convergence
     from duh.consensus.handlers import (
@@ -332,13 +334,25 @@ async def _run_consensus(
     if display and ctx.tool_calls_log:
         display.show_tool_use(ctx.tool_calls_log)
 
+    # Collect all citations across rounds
+    all_citations: list[dict[str, str | None]] = []
+    for rr in ctx.round_history:
+        all_citations.extend(rr.proposal_citations)
+        for ch in rr.challenges:
+            all_citations.extend(ch.citations)
+    # Include current round (may not be archived yet)
+    all_citations.extend(ctx.proposal_citations)
+    for ch in ctx.challenges:
+        all_citations.extend(ch.citations)
+
     return (
         ctx.decision or "",
         ctx.confidence,
         ctx.rigor,
         ctx.dissent,
         pm.total_cost,
         ctx.overview,
+        all_citations,
     )
 
 
@@ -490,14 +504,15 @@ def ask(
         _error(str(e))
         return  # unreachable
 
-    decision, confidence, rigor, dissent, cost, overview = result
+    decision, confidence, rigor, dissent, cost, overview, citations = result
 
     from duh.cli.display import ConsensusDisplay
 
     display = ConsensusDisplay()
     display.show_final_decision(
         decision, confidence, rigor, cost, dissent, overview=overview
     )
+    display.show_citations(citations)
 
 
 async def _refine_question(question: str, config: DuhConfig) -> str:
@@ -532,7 +547,9 @@ async def _ask_async(
     panel: list[str] | None = None,
     proposer_override: str | None = None,
     challengers_override: list[str] | None = None,
-) -> tuple[str, float, float, str | None, float, str | None]:
+) -> tuple[
+    str, float, float, str | None, float, str | None, list[dict[str, str | None]]
+]:
     """Async implementation for the ask command."""
     from duh.cli.display import ConsensusDisplay
 
@@ -641,12 +658,19 @@ async def _ask_auto_async(
 
         display = ConsensusDisplay()
         display.start()
-        decision, confidence, rigor, dissent, cost, overview = await _run_consensus(
-            question, config, pm, display=display
-        )
+        (
+            decision,
+            confidence,
+            rigor,
+            dissent,
+            cost,
+            overview,
+            citations,
+        ) = await _run_consensus(question, config, pm, display=display)
         display.show_final_decision(
             decision, confidence, rigor, cost, dissent, overview=overview
         )
+        display.show_citations(citations)
 
 
 async def _ask_decompose_async(
@@ -719,10 +743,11 @@ async def _ask_decompose_async(
     # Single-subtask optimization: skip synthesis
     if len(subtask_specs) == 1:
         result = await _run_consensus(question, config, pm, display=display)
-        decision, confidence, rigor, dissent, cost, overview = result
+        decision, confidence, rigor, dissent, cost, overview, citations = result
         display.show_final_decision(
             decision, confidence, rigor, cost, dissent, overview=overview
         )
+        display.show_citations(citations)
         await engine.dispose()
         return
 
@@ -2371,6 +2396,7 @@ async def _batch_async(
                     _dissent,
                     _cost,
                     _overview,
+                    _citations,
                 ) = await _run_consensus(question, config, pm)
 
             q_cost = pm.total_cost - cost_before

diff --git a/src/duh/cli/display.py b/src/duh/cli/display.py
@@ -357,6 +357,61 @@ def show_tool_use(self, tool_calls_log: list[dict[str, str]]) -> None:
             )
         )
 
+    # ── Citations ──────────────────────────────────────────────
+
+    def show_citations(
+        self,
+        citations: Sequence[dict[str, str | None]],
+    ) -> None:
+        """Print a numbered "Sources" panel listing each cited URL once.
+
+        Entries with no ``url`` and repeated URLs are dropped (first one wins);
+        nothing is printed if none remain. The rest are ordered by hostname,
+        busiest host first, and an untitled entry is labelled with its hostname.
+
+        Args:
+            citations: Mappings read through their ``url`` and ``title`` keys.
+        """
+        if not citations:
+            return
+
+        from itertools import chain
+        from urllib.parse import urlparse
+
+        from rich.markup import escape
+
+        # Bucket first-seen URLs by hostname; dict order keeps host ties stable.
+        seen: set[str] = set()
+        groups: dict[str, list[tuple[str, str]]] = {}
+        for c in citations:
+            url = c.get("url") or ""
+            if not url or url in seen:
+                continue
+            seen.add(url)
+            try:
+                host = urlparse(url).netloc or url
+            except ValueError:
+                # urlparse rejects malformed netlocs; group under the raw URL.
+                host = url
+            groups.setdefault(host, []).append((c.get("title") or host, url))
+
+        if not groups:
+            return
+
+        # Escape external text so brackets are not parsed as Rich markup.
+        ranked = sorted(groups.values(), key=len, reverse=True)
+        parts = [
+            f"  [{idx}] {escape(title)}\n      {escape(url)}"
+            for idx, (title, url) in enumerate(chain.from_iterable(ranked), 1)
+        ]
+        self._console.print(
+            Panel(
+                "\n".join(parts),
+                title=f"[bold cyan]Sources[/bold cyan] ({len(seen)})",
+                border_style="cyan",
+            )
+        )
+
     # ── Final output ──────────────────────────────────────────
 
     def show_final_decision(

diff --git a/src/duh/mcp/server.py b/src/duh/mcp/server.py
@@ -135,9 +135,15 @@ async def _handle_ask(args: dict) -> list[TextContent]:  # type: ignore[type-arg
             )
         ]
     else:
-        decision, confidence, rigor, dissent, cost, _overview = await _run_consensus(
-            question, config, pm
-        )
+        (
+            decision,
+            confidence,
+            rigor,
+            dissent,
+            cost,
+            _overview,
+            _citations,
+        ) = await _run_consensus(question, config, pm)
         return [
             TextContent(
                 type="text",

diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
@@ -76,6 +76,7 @@ def test_displays_decision(
             None,
             0.0042,
             None,
+            [],
         )
 
         result = runner.invoke(cli, ["ask", "What database?"])
@@ -103,6 +104,7 @@ def test_displays_dissent(
             "[model-a]: PostgreSQL would be better for scale.",
             0.01,
             None,
+            [],
         )
 
         result = runner.invoke(cli, ["ask", "What database?"])
@@ -123,7 +125,7 @@ def test_no_dissent_when_none(
         from duh.config.schema import DuhConfig
 
         mock_config.return_value = DuhConfig()
-        mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None)
+        mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, [])
 
         result = runner.invoke(cli, ["ask", "Question?"])
 
@@ -142,7 +144,7 @@ def test_rounds_option(
 
         config = DuhConfig()
         mock_config.return_value = config
-        mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None)
+        mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, [])
 
         result = runner.invoke(cli, ["ask", "--rounds", "5", "Question?"])
 

diff --git a/tests/unit/test_cli_batch.py b/tests/unit/test_cli_batch.py
@@ -452,10 +452,18 @@ async def fake_consensus(
             pm: Any,
             display: Any = None,
             tool_registry: Any = None,
-        ) -> tuple[str, float, float, str | None, float, str | None]:
+        ) -> tuple[
+            str,
+            float,
+            float,
+            str | None,
+            float,
+            str | None,
+            list[dict[str, str | None]],
+        ]:
             nonlocal consensus_called
             consensus_called = True
-            return ("Use SQLite.", 0.85, 1.0, None, 0.01, None)
+            return ("Use SQLite.", 0.85, 1.0, None, 0.01, None, [])
 
         with (
             patch("duh.cli.app.load_config", return_value=config),
@@ -547,7 +555,7 @@ async def fake_consensus(
             display: Any = None,
             tool_registry: Any = None,
         ) -> tuple[str, float, float, str | None, float, str | None]:
-            return ("Answer.", 0.9, 1.0, None, 0.01, None)
+            return ("Answer.", 0.9, 1.0, None, 0.01, None, [])
 
         with (
             patch("duh.cli.app.load_config", return_value=config),
@@ -606,7 +614,7 @@ async def fake_consensus(
             call_count += 1
             if question == "Q2":
                 raise RuntimeError("Provider timeout")
-            return ("Answer.", 0.9, 1.0, None, 0.01, None)
+            return ("Answer.", 0.9, 1.0, None, 0.01, None, [])
 
         with (
             patch("duh.cli.app.load_config", return_value=config),
@@ -653,7 +661,7 @@ async def fake_consensus(
         ) -> tuple[str, float, float, str | None, float, str | None]:
             if question == "Q2":
                 raise RuntimeError("Model unavailable")
-            return ("Answer.", 0.9, 1.0, None, 0.01, None)
+            return ("Answer.", 0.9, 1.0, None, 0.01, None, [])
 
         with (
             patch("duh.cli.app.load_config", return_value=config),

diff --git a/tests/unit/test_cli_voting.py b/tests/unit/test_cli_voting.py
@@ -147,7 +147,7 @@ def test_default_protocol_is_consensus(
         from duh.config.schema import DuhConfig
 
         mock_config.return_value = DuhConfig()
-        mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None)
+        mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, [])
 
         result = runner.invoke(cli, ["ask", "Question?"])
         assert result.exit_code == 0

diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py
@@ -177,7 +177,7 @@ async def test_consensus_protocol(self) -> None:
             patch(
                 "duh.cli.app._run_consensus",
                 new_callable=AsyncMock,
-                return_value=("Use SQLite.", 0.9, 1.0, "minor dissent", 0.05, None),
+                return_value=("Use SQLite.", 0.9, 1.0, "minor dissent", 0.05, None, []),
             ),
         ):
             result = await _handle_ask({"question": "What DB?", "rounds": 2})