From 901e69af89f24f6bf8379ecb9e59185a69cbb95b Mon Sep 17 00:00:00 2001 From: jiebei Date: Sun, 29 Mar 2026 15:13:15 +0800 Subject: [PATCH] Add XML declaration header to GraphML snapshot output Fix graph.graphml output to include the XML declaration header (<?xml version="1.0" encoding="UTF-8"?>) required by the XML specification, ensuring the generated file can be correctly imported by Gephi and other GraphML-consuming tools. --- .../patch-20260329050537886994.json | 4 + .../index/operations/snapshot_graphml.py | 4 +- tests/unit/indexing/test_snapshot_graphml.py | 96 +++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 .semversioner/next-release/patch-20260329050537886994.json create mode 100644 tests/unit/indexing/test_snapshot_graphml.py diff --git a/.semversioner/next-release/patch-20260329050537886994.json b/.semversioner/next-release/patch-20260329050537886994.json new file mode 100644 index 0000000000..2a28daea11 --- /dev/null +++ b/.semversioner/next-release/patch-20260329050537886994.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Add XML declaration header to GraphML snapshot for Gephi compatibility" +} diff --git a/packages/graphrag/graphrag/index/operations/snapshot_graphml.py b/packages/graphrag/graphrag/index/operations/snapshot_graphml.py index 16b2d1d574..f7ea218f2f 100644 --- a/packages/graphrag/graphrag/index/operations/snapshot_graphml.py +++ b/packages/graphrag/graphrag/index/operations/snapshot_graphml.py @@ -7,6 +7,8 @@ import pandas as pd from graphrag_storage import Storage +_XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>' + async def snapshot_graphml( edges: pd.DataFrame, @@ -15,5 +17,5 @@ async def snapshot_graphml( ) -> None: """Take a entire snapshot of a graph to standard graphml format.""" graph = nx.from_pandas_edgelist(edges, edge_attr=["weight"]) - graphml = "\n".join(nx.generate_graphml(graph)) + graphml = _XML_HEADER + "\n" + "\n".join(nx.generate_graphml(graph)) await storage.set(name + ".graphml", graphml) diff --git 
a/tests/unit/indexing/test_snapshot_graphml.py b/tests/unit/indexing/test_snapshot_graphml.py new file mode 100644 index 0000000000..3b8e7e0fed --- /dev/null +++ b/tests/unit/indexing/test_snapshot_graphml.py @@ -0,0 +1,96 @@ +# Copyright (C) 2026 Microsoft +# Licensed under the MIT License + +"""Tests for snapshot_graphml operation.""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +import pandas as pd +from graphrag.index.operations.snapshot_graphml import ( + _XML_HEADER, + snapshot_graphml, +) +from graphrag_storage import Storage + +if TYPE_CHECKING: + import re + from collections.abc import Iterator + + +class FakeStorage(Storage): + """In-memory storage for testing.""" + + def __init__(self, **kwargs: Any) -> None: + self.data: dict[str, str] = {} + + async def set(self, key: str, value: Any, encoding: str | None = None) -> None: + self.data[key] = value + + def find(self, file_pattern: re.Pattern[str]) -> Iterator[str]: + yield from [] + + async def get( + self, key: str, as_bytes: bool | None = None, encoding: str | None = None + ) -> Any: + return self.data.get(key) + + async def has(self, key: str) -> bool: + return key in self.data + + async def delete(self, key: str) -> None: + self.data.pop(key, None) + + async def clear(self) -> None: + self.data.clear() + + def child(self, name: str | None) -> FakeStorage: + return FakeStorage() + + def keys(self) -> list[str]: + return list(self.data.keys()) + + async def get_creation_date(self, key: str) -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %z") + + +def _make_edges( + rows: list[tuple[str, str, float]], +) -> pd.DataFrame: + """Build a minimal relationships DataFrame.""" + return pd.DataFrame(rows, columns=["source", "target", "weight"]) + + +class TestSnapshotGraphml: + """Tests for the snapshot_graphml function.""" + + async def test_output_has_xml_header(self) -> None: + """GraphML output must start 
with the XML declaration.""" + edges = _make_edges([("A", "B", 1.0)]) + storage = FakeStorage() + await snapshot_graphml(edges, "graph", storage) + + content = storage.data["graph.graphml"] + assert content.startswith(_XML_HEADER) + + async def test_output_is_valid_graphml(self) -> None: + """Output should contain <graphml> root element.""" + edges = _make_edges([("A", "B", 1.0)]) + storage = FakeStorage() + await snapshot_graphml(edges, "graph", storage) + + content = storage.data["graph.graphml"] + assert "</graphml>" in content + + async def test_empty_graph(self) -> None: + """Empty edge list should produce valid GraphML with XML header.""" + edges = _make_edges([]) + storage = FakeStorage() + await snapshot_graphml(edges, "graph", storage) + + content = storage.data["graph.graphml"] + assert content.startswith(_XML_HEADER) + assert "