diff --git a/.semversioner/next-release/patch-20260329050537886994.json b/.semversioner/next-release/patch-20260329050537886994.json new file mode 100644 index 0000000000..2a28daea11 --- /dev/null +++ b/.semversioner/next-release/patch-20260329050537886994.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Add XML declaration header to GraphML snapshot for Gephi compatibility" +} diff --git a/packages/graphrag/graphrag/index/operations/snapshot_graphml.py b/packages/graphrag/graphrag/index/operations/snapshot_graphml.py index 16b2d1d574..f7ea218f2f 100644 --- a/packages/graphrag/graphrag/index/operations/snapshot_graphml.py +++ b/packages/graphrag/graphrag/index/operations/snapshot_graphml.py @@ -7,6 +7,8 @@ import pandas as pd from graphrag_storage import Storage +_XML_HEADER = '' + async def snapshot_graphml( edges: pd.DataFrame, @@ -15,5 +17,5 @@ async def snapshot_graphml( ) -> None: """Take a entire snapshot of a graph to standard graphml format.""" graph = nx.from_pandas_edgelist(edges, edge_attr=["weight"]) - graphml = "\n".join(nx.generate_graphml(graph)) + graphml = _XML_HEADER + "\n" + "\n".join(nx.generate_graphml(graph)) await storage.set(name + ".graphml", graphml) diff --git a/tests/unit/indexing/test_snapshot_graphml.py b/tests/unit/indexing/test_snapshot_graphml.py new file mode 100644 index 0000000000..3b8e7e0fed --- /dev/null +++ b/tests/unit/indexing/test_snapshot_graphml.py @@ -0,0 +1,96 @@ +# Copyright (C) 2026 Microsoft +# Licensed under the MIT License + +"""Tests for snapshot_graphml operation.""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +import pandas as pd +from graphrag.index.operations.snapshot_graphml import ( + _XML_HEADER, + snapshot_graphml, +) +from graphrag_storage import Storage + +if TYPE_CHECKING: + import re + from collections.abc import Iterator + + +class FakeStorage(Storage): + """In-memory storage for testing.""" + + def __init__(self, **kwargs: Any) -> None: + self.data: dict[str, str] = {} + + async def set(self, key: str, value: Any, encoding: str | None = None) -> None: + self.data[key] = value + + def find(self, file_pattern: re.Pattern[str]) -> Iterator[str]: + yield from [] + + async def get( + self, key: str, as_bytes: bool | None = None, encoding: str | None = None + ) -> Any: + return self.data.get(key) + + async def has(self, key: str) -> bool: + return key in self.data + + async def delete(self, key: str) -> None: + self.data.pop(key, None) + + async def clear(self) -> None: + self.data.clear() + + def child(self, name: str | None) -> FakeStorage: + return FakeStorage() + + def keys(self) -> list[str]: + return list(self.data.keys()) + + async def get_creation_date(self, key: str) -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %z") + + +def _make_edges( + rows: list[tuple[str, str, float]], +) -> pd.DataFrame: + """Build a minimal relationships DataFrame.""" + return pd.DataFrame(rows, columns=["source", "target", "weight"]) + + +class TestSnapshotGraphml: + """Tests for the snapshot_graphml function.""" + + async def test_output_has_xml_header(self) -> None: + """GraphML output must start with the XML declaration.""" + edges = _make_edges([("A", "B", 1.0)]) + storage = FakeStorage() + await snapshot_graphml(edges, "graph", storage) + + content = storage.data["graph.graphml"] + assert content.startswith(_XML_HEADER) + + async def test_output_is_valid_graphml(self) -> None: + """Output should contain root element.""" + edges = _make_edges([("A", "B", 1.0)]) + storage = FakeStorage() + await snapshot_graphml(edges, "graph", storage) + + content = storage.data["graph.graphml"] + assert "" in content + + async def test_empty_graph(self) -> None: + """Empty edge list should produce valid GraphML with XML header.""" + edges = _make_edges([]) + storage = FakeStorage() + await snapshot_graphml(edges, "graph", storage) + + content = storage.data["graph.graphml"] + assert content.startswith(_XML_HEADER) + assert "