Skip to content

Commit e433625

Browse files
authored
feat: expose CypherEngine in Python for multi-query execution (#124)
Currently, every call of `query.execute()` needs to build the catalog, which is an expensive operation. This PR exposes a `CypherEngine` API in Python for multi-query execution, with a reusable catalog. An example usage (also included in the README.md): ```python import pyarrow as pa from lance_graph import CypherEngine, GraphConfig cfg = ( GraphConfig.builder() .with_node_label("Person", "id") .with_node_label("City", "id") .with_relationship("lives_in", "src", "dst") .build() ) datasets = { "Person": pa.table({"id": [1, 2], "name": ["Alice", "Bob"], "age": [30, 25]}), "City": pa.table({"id": [10, 20], "name": ["London", "Sydney"]}), "lives_in": pa.table({"src": [1, 2], "dst": [10, 20]}), } # Create engine once - builds catalog engine = CypherEngine(cfg, datasets) # Execute multiple queries efficiently - catalog is reused result1 = engine.execute("MATCH (p:Person) WHERE p.age > 25 RETURN p.name") result2 = engine.execute("MATCH (p:Person)-[:lives_in]->(c:City) RETURN p.name, c.name") result3 = engine.execute("MATCH (p:Person) RETURN count(*) as total") print(result1.to_pylist()) # [{'p.name': 'Alice'}] ```
1 parent 624f930 commit e433625

7 files changed

Lines changed: 406 additions & 3 deletions

File tree

python/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ arrow = { version = "56.2", features = ["pyarrow"] }
1414
arrow-array = "56.2"
1515
arrow-schema = "56.2"
1616
arrow-ipc = "56.2"
17+
datafusion = { version = "50.3", default-features = false }
1718
futures = "0.3"
1819
lance-graph = { path = "../crates/lance-graph" }
1920
serde = { version = "1", features = ["derive"] }

python/README.md

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,41 @@ print(result.to_pylist())
4848
[{'p.name': 'Alice', 'c.name': 'London'}, {'p.name': 'Bob', 'c.name': 'Sydney'}]
4949
```
5050

51-
### 2. Build a Knowledge Graph from Text
51+
### 2. Multi-Query Execution with CypherEngine
52+
53+
For executing multiple queries against the same datasets, use `CypherEngine` to cache the catalog and achieve better performance:
54+
55+
```python
56+
import pyarrow as pa
57+
from lance_graph import CypherEngine, GraphConfig
58+
59+
cfg = (
60+
GraphConfig.builder()
61+
.with_node_label("Person", "id")
62+
.with_node_label("City", "id")
63+
.with_relationship("lives_in", "src", "dst")
64+
.build()
65+
)
66+
67+
datasets = {
68+
"Person": pa.table({"id": [1, 2], "name": ["Alice", "Bob"], "age": [30, 25]}),
69+
"City": pa.table({"id": [10, 20], "name": ["London", "Sydney"]}),
70+
"lives_in": pa.table({"src": [1, 2], "dst": [10, 20]}),
71+
}
72+
73+
# Create engine once - builds catalog
74+
engine = CypherEngine(cfg, datasets)
75+
76+
# Execute multiple queries efficiently - catalog is reused
77+
result1 = engine.execute("MATCH (p:Person) WHERE p.age > 25 RETURN p.name")
78+
result2 = engine.execute("MATCH (p:Person)-[:lives_in]->(c:City) RETURN p.name, c.name")
79+
result3 = engine.execute("MATCH (p:Person) RETURN count(*) as total")
80+
81+
print(result1.to_pylist())
82+
# [{'p.name': 'Alice'}]
83+
```
84+
85+
### 3. Build a Knowledge Graph from Text
5286

5387
```python
5488
from pathlib import Path
@@ -107,7 +141,7 @@ result = kg.query("""
107141
print(result.to_pylist())
108142
```
109143

110-
### 3. Natural Language Q&A
144+
### 4. Natural Language Q&A
111145

112146
```python
113147
from knowledge_graph.llm.qa import ask_question

python/python/lance_graph/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,19 @@ def _load_dev_build() -> ModuleType:
7171
GraphConfig = _bindings.graph.GraphConfig
7272
GraphConfigBuilder = _bindings.graph.GraphConfigBuilder
7373
CypherQuery = _bindings.graph.CypherQuery
74+
CypherEngine = _bindings.graph.CypherEngine
75+
ExecutionStrategy = _bindings.graph.ExecutionStrategy
7476
VectorSearch = _bindings.graph.VectorSearch
7577
DistanceMetric = _bindings.graph.DistanceMetric
78+
7679
DirNamespace = _bindings.graph.DirNamespace
7780

7881
__all__ = [
7982
"GraphConfig",
8083
"GraphConfigBuilder",
8184
"CypherQuery",
85+
"CypherEngine",
86+
"ExecutionStrategy",
8287
"VectorSearch",
8388
"DistanceMetric",
8489
"DirNamespace",
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright The Lance Authors
3+
4+
import pyarrow as pa
5+
import pytest
6+
from lance_graph import CypherEngine, CypherQuery, GraphConfig
7+
8+
9+
@pytest.fixture
10+
def graph_env():
11+
"""Create sample graph data for testing."""
12+
people_table = pa.table(
13+
{
14+
"person_id": [1, 2, 3, 4],
15+
"name": ["Alice", "Bob", "Carol", "David"],
16+
"age": [28, 34, 29, 42],
17+
"city": ["New York", "San Francisco", "New York", "Chicago"],
18+
}
19+
)
20+
21+
companies_table = pa.table(
22+
{
23+
"company_id": [101, 102, 103],
24+
"company_name": ["TechCorp", "DataInc", "CloudSoft"],
25+
"industry": ["Technology", "Analytics", "Cloud"],
26+
}
27+
)
28+
29+
employment_table = pa.table(
30+
{
31+
"person_id": [1, 2, 3, 4],
32+
"company_id": [101, 101, 102, 103],
33+
"position": ["Engineer", "Designer", "Manager", "Director"],
34+
"salary": [120000, 95000, 130000, 180000],
35+
}
36+
)
37+
38+
friendship_table = pa.table(
39+
{
40+
"person1_id": [1, 1, 2, 3],
41+
"person2_id": [2, 3, 4, 4],
42+
"friendship_type": ["close", "casual", "close", "casual"],
43+
"years_known": [5, 2, 3, 1],
44+
}
45+
)
46+
47+
config = (
48+
GraphConfig.builder()
49+
.with_node_label("Person", "person_id")
50+
.with_node_label("Company", "company_id")
51+
.with_relationship("WORKS_FOR", "person_id", "company_id")
52+
.with_relationship("FRIEND_OF", "person1_id", "person2_id")
53+
.build()
54+
)
55+
56+
datasets = {
57+
"Person": people_table,
58+
"Company": companies_table,
59+
"WORKS_FOR": employment_table,
60+
"FRIEND_OF": friendship_table,
61+
}
62+
63+
return config, datasets
64+
65+
66+
def test_cypher_engine_basic_query(graph_env):
67+
"""Test basic query execution with CypherEngine."""
68+
config, datasets = graph_env
69+
engine = CypherEngine(config, datasets)
70+
71+
result = engine.execute("MATCH (p:Person) RETURN p.name, p.age")
72+
data = result.to_pydict()
73+
74+
assert set(data.keys()) == {"p.name", "p.age"}
75+
assert len(data["p.name"]) == 4
76+
assert "Alice" in set(data["p.name"])
77+
78+
79+
def test_cypher_engine_filtered_query(graph_env):
80+
"""Test filtered query with WHERE clause."""
81+
config, datasets = graph_env
82+
engine = CypherEngine(config, datasets)
83+
84+
result = engine.execute("MATCH (p:Person) WHERE p.age > 30 RETURN p.name, p.age")
85+
data = result.to_pydict()
86+
87+
assert len(data["p.name"]) == 2
88+
assert set(data["p.name"]) == {"Bob", "David"}
89+
assert all(age > 30 for age in data["p.age"])
90+
91+
92+
def test_cypher_engine_relationship_query(graph_env):
93+
"""Test query with relationship traversal."""
94+
config, datasets = graph_env
95+
engine = CypherEngine(config, datasets)
96+
97+
result = engine.execute(
98+
"MATCH (p:Person)-[:WORKS_FOR]->(c:Company) "
99+
"RETURN p.name AS person_name, c.company_name AS company_name"
100+
)
101+
data = result.to_pydict()
102+
103+
assert len(data["person_name"]) == 4
104+
assert "Alice" in data["person_name"]
105+
assert "TechCorp" in data["company_name"]
106+
107+
108+
def test_cypher_engine_multiple_queries(graph_env):
109+
"""Test that catalog is reused across multiple queries."""
110+
config, datasets = graph_env
111+
engine = CypherEngine(config, datasets)
112+
113+
# Execute multiple different queries
114+
result1 = engine.execute("MATCH (p:Person) WHERE p.age > 30 RETURN p.name")
115+
result2 = engine.execute("MATCH (p:Person) WHERE p.city = 'New York' RETURN p.name")
116+
result3 = engine.execute("MATCH (p:Person) RETURN count(*) as total")
117+
118+
data1 = result1.to_pydict()
119+
data2 = result2.to_pydict()
120+
data3 = result3.to_pydict()
121+
122+
assert len(data1["p.name"]) == 2
123+
assert len(data2["p.name"]) == 2
124+
assert data3["total"][0] == 4
125+
126+
127+
def test_cypher_engine_aggregation(graph_env):
128+
"""Test aggregation queries."""
129+
config, datasets = graph_env
130+
engine = CypherEngine(config, datasets)
131+
132+
result = engine.execute(
133+
"MATCH (p:Person) RETURN count(*) as total, avg(p.age) as avg_age"
134+
)
135+
data = result.to_pydict()
136+
137+
assert data["total"][0] == 4
138+
# Average of [28, 34, 29, 42] = 33.25
139+
assert abs(data["avg_age"][0] - 33.25) < 0.01
140+
141+
142+
def test_cypher_engine_vs_cypher_query_equivalence(graph_env):
143+
"""Test that CypherEngine produces same results as CypherQuery."""
144+
config, datasets = graph_env
145+
146+
query_text = "MATCH (p:Person) WHERE p.age > 30 RETURN p.name, p.age ORDER BY p.age"
147+
148+
# Execute with CypherQuery
149+
query = CypherQuery(query_text).with_config(config)
150+
result_query = query.execute(datasets)
151+
152+
# Execute with CypherEngine
153+
engine = CypherEngine(config, datasets)
154+
result_engine = engine.execute(query_text)
155+
156+
# Results should be identical
157+
assert result_query.to_pydict() == result_engine.to_pydict()
158+
159+
160+
def test_cypher_engine_config_access(graph_env):
161+
"""Test that we can access the engine's config."""
162+
config, datasets = graph_env
163+
engine = CypherEngine(config, datasets)
164+
165+
engine_config = engine.config()
166+
167+
assert "person" in engine_config.node_labels() # case-insensitive
168+
assert "company" in engine_config.node_labels()

0 commit comments

Comments
 (0)