Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
764532e
feat: add fts support
egolearner May 15, 2026
288ea80
fix mac compile & ci
egolearner May 15, 2026
0704718
refactor parse fts & add fts debug text
egolearner May 18, 2026
784e5fd
fix some problems
egolearner May 18, 2026
7ae49af
refactor(fts_column): reorganize into tokenizer/, posting/, iterator/…
egolearner May 19, 2026
84dd52a
perf: or use multi_get
egolearner May 20, 2026
22a612f
perf: optimize disjunction iterator
egolearner May 20, 2026
3a05a15
perf: fts use hashskiplist
egolearner May 20, 2026
a7da75e
refactor batch_get_postings
egolearner May 20, 2026
12a8d56
perf: optimize iterator virtual function
egolearner May 21, 2026
34740d1
bench limit max_queries
egolearner May 21, 2026
ab52311
perf: use PinnableSlice
egolearner May 21, 2026
14882eb
perf: bitpacked avx2
egolearner May 21, 2026
3badf49
chore: rm unnecessary checkpoint
egolearner May 21, 2026
ca5808e
perf: cache block_max_info_for result to skip repeated binary searche…
egolearner May 21, 2026
bbb74ae
perf: precompute BM25 IDF weight per term to eliminate log() from sco…
egolearner May 21, 2026
5ea99ff
perf: cache SIMD dispatch function pointers in iterator to eliminate …
egolearner May 21, 2026
d001175
rename
egolearner May 21, 2026
713b200
perf: push filter down into FTS composite iterators
egolearner May 21, 2026
3919414
refactor: drop block-max helpers superseded by block_max_info_for
egolearner May 22, 2026
647e0ff
perf: candidate-driven (brute-force) FTS evaluation
egolearner May 22, 2026
04cb8f6
PartialMerge no optimize
egolearner May 22, 2026
2739620
fix fts score
egolearner May 22, 2026
122fb51
python binding support fts
egolearner May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Auto-generated files — collapsed in GitHub PR diffs
src/db/index/column/fts_column/gen/** linguist-generated=true
src/db/sqlengine/antlr/gen/** linguist-generated=true
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,12 @@
[submodule "thirdparty/RaBitQ-Library/RaBitQ-Library-0.1"]
path = thirdparty/RaBitQ-Library/RaBitQ-Library-0.1
url = https://github.com/VectorDB-NTU/RaBitQ-Library.git
[submodule "thirdparty/cppjieba/cppjieba-5.6.7"]
path = thirdparty/cppjieba/cppjieba-5.6.7
url = https://github.com/yanyiwu/cppjieba.git
[submodule "thirdparty/FastPFOR/FastPFOR-0.4.0"]
path = thirdparty/FastPFOR/FastPFOR-0.4.0
url = https://github.com/fast-pack/FastPFOR.git
[submodule "thirdparty/limonp/limonp-v1.0.2"]
path = thirdparty/limonp/limonp-v1.0.2
url = https://github.com/yanyiwu/limonp.git
158 changes: 158 additions & 0 deletions python/tests/test_fts_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Copyright 2025-present the zvec project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for FTS (Full-Text Search) query support in the Python SDK."""

import pickle

import pytest

from zvec.model.param.query import Fts, Query
Copy link
Copy Markdown
Collaborator

@JalinWang JalinWang May 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name "Fts" is a bit too generic. Would it be more precise to name it after the underlying type it maps to, such as _FtsQuery (binding) or FtsQueryParam (C++)?



class TestFtsQueryValidation:
    """Validation rules that ``Query._validate`` applies to FTS parameters."""

    def test_fts_query_string_only(self):
        """An Fts carrying only ``query_string`` passes validation."""
        query = Query(
            field_name="content",
            fts=Fts(query_string='+hello -world "exact phrase"'),
        )
        query._validate()
        fts = query.fts
        assert fts.query_string == '+hello -world "exact phrase"'
        assert fts.match_string is None
        assert query.has_fts() is True

    def test_fts_match_string_only(self):
        """An Fts carrying only ``match_string`` passes validation."""
        query = Query(field_name="content", fts=Fts(match_string="machine learning"))
        query._validate()
        fts = query.fts
        assert fts.match_string == "machine learning"
        assert fts.query_string is None
        assert query.has_fts() is True

    def test_fts_query_string_and_match_string_mutually_exclusive(self):
        """Setting both ``query_string`` and ``match_string`` is rejected."""
        both_set = Fts(query_string="+hello", match_string="hello world")
        query = Query(field_name="content", fts=both_set)
        with pytest.raises(ValueError, match="mutually exclusive"):
            query._validate()

    def test_no_fts(self):
        """A plain vector query reports ``has_fts() == False``."""
        query = Query(field_name="embedding", vector=[0.1, 0.2, 0.3])
        assert query.has_fts() is False

    def test_vector_and_fts_mutually_exclusive(self):
        """A single Query may not carry both a vector and an FTS condition."""
        query = Query(
            field_name="embedding",
            vector=[0.1, 0.2, 0.3],
            fts=Fts(match_string="deep learning"),
        )
        with pytest.raises(ValueError, match="Cannot combine fts with vector search"):
            query._validate()

    def test_fts_without_vector_or_id(self):
        """An FTS-only Query (no vector, no id) passes validation."""
        query = Query(field_name="content", fts=Fts(query_string="hello"))
        query._validate()
        assert query.has_vector() is False
        assert query.has_id() is False
        assert query.has_fts() is True


class TestFtsQueryBinding:
    """Exercise the binding-layer objects (``_FtsQuery`` / ``_VectorQuery``)."""

    def test_import_fts_query(self):
        """``_FtsQuery`` imports from ``_zvec.param`` and defaults to empty strings."""
        from _zvec.param import _FtsQuery

        default_fts = _FtsQuery()
        assert default_fts.query_string == ""
        assert default_fts.match_string == ""

    def test_fts_query_set_fields(self):
        """Both string fields of ``_FtsQuery`` are writable and read back unchanged."""
        from _zvec.param import _FtsQuery

        with_query = _FtsQuery()
        with_query.query_string = "+hello -world"
        assert with_query.query_string == "+hello -world"

        with_match = _FtsQuery()
        with_match.match_string = "machine learning"
        assert with_match.match_string == "machine learning"

    def test_fts_query_pickle(self):
        """``_FtsQuery`` keeps its field values across a pickle round trip."""
        from _zvec.param import _FtsQuery

        original = _FtsQuery()
        original.query_string = "+vector search"
        original.match_string = ""

        restored = pickle.loads(pickle.dumps(original))
        assert restored.query_string == "+vector search"
        assert restored.match_string == ""

    def test_vector_query_fts_field(self):
        """``_VectorQuery.fts_query`` starts as None and accepts an ``_FtsQuery``."""
        from _zvec.param import _FtsQuery, _VectorQuery

        vector_query = _VectorQuery()
        # The field is optional, so a fresh object reports None.
        assert vector_query.fts_query is None

        # Assign an FTS sub-query and read it back.
        fts = _FtsQuery()
        fts.query_string = "hello"
        vector_query.fts_query = fts
        assert vector_query.fts_query is not None
        assert vector_query.fts_query.query_string == "hello"

    def test_vector_query_pickle_with_fts(self):
        """A ``_VectorQuery`` carrying an FTS sub-query survives pickling."""
        from _zvec.param import _FtsQuery, _VectorQuery

        fts = _FtsQuery()
        fts.match_string = "test query"

        vector_query = _VectorQuery()
        vector_query.topk = 10
        vector_query.field_name = "embedding"
        vector_query.fts_query = fts

        restored = pickle.loads(pickle.dumps(vector_query))
        assert restored.topk == 10
        assert restored.field_name == "embedding"
        assert restored.fts_query is not None
        assert restored.fts_query.match_string == "test query"

    def test_vector_query_pickle_without_fts(self):
        """A ``_VectorQuery`` without an FTS sub-query survives pickling."""
        from _zvec.param import _VectorQuery

        vector_query = _VectorQuery()
        vector_query.topk = 5
        vector_query.field_name = "vec"

        restored = pickle.loads(pickle.dumps(vector_query))
        assert restored.topk == 5
        assert restored.field_name == "vec"
        assert restored.fts_query is None
5 changes: 4 additions & 1 deletion python/zvec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,14 @@
from .model.doc import Doc

# —— Query & index parameters ——
# (includes the FTS index/query params, which come from the C++ binding)
from .model.param import (
AddColumnOption,
AlterColumnOption,
CollectionOption,
FlatIndexParam,
FtsIndexParam,
FtsQueryParam,
HnswIndexParam,
HnswQueryParam,
HnswRabitqIndexParam,
Expand All @@ -73,7 +76,7 @@
VamanaIndexParam,
VamanaQueryParam,
)
from .model.param.query import Query, VectorQuery
from .model.param.query import Fts, Query, VectorQuery

# —— Schema & field definitions ——
from .model.schema import CollectionSchema, CollectionStats, FieldSchema, VectorSchema
Expand Down
27 changes: 22 additions & 5 deletions python/zvec/executor/query_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import numpy as np
from _zvec import _Collection
from _zvec.param import _VectorQuery
from _zvec.param import _FtsQuery, _VectorQuery

from ..extension import ReRanker, RrfReRanker, WeightedReRanker
from ..model.convert import convert_to_py_doc
Expand Down Expand Up @@ -141,6 +141,14 @@ def _do_build_query_wo_vector(self, ctx: QueryContext) -> _VectorQuery:
core_vector.output_fields = ctx.output_fields
return core_vector

def _do_build_fts_query(self, query: Query, core_vector: _VectorQuery) -> None:
    """Attach an FTS sub-query to ``core_vector`` when ``query`` carries one.

    Does nothing when ``query.has_fts()`` is false. Otherwise a new
    ``_FtsQuery`` is filled from ``query.fts`` and assigned to
    ``core_vector.fts_query``.

    Args:
        query: The user-facing query whose ``fts`` attribute is read.
        core_vector: The binding-level query object that is updated in place.
    """
    if not query.has_fts():
        return

    fts_query = _FtsQuery()
    # A None (or empty) string on the Python side is passed on as "".
    fts_query.query_string = query.fts.query_string or ""
    fts_query.match_string = query.fts.match_string or ""
    core_vector.fts_query = fts_query

def _do_build_query_with_vector(
self, ctx: QueryContext, query: Query, collection: _Collection
) -> _VectorQuery:
Expand All @@ -149,25 +157,34 @@ def _do_build_query_with_vector(
if query.param:
core_vector.query_params = query.param

# set FTS query if provided
self._do_build_fts_query(query, core_vector)

# set output_fields
core_vector.output_fields = ctx.output_fields

# FTS-only query (no vector, no id) — skip vector resolution
if query.has_fts() and not query.has_vector() and not query.has_id():
return core_vector

vector_schema = (
self._schema.vector(query.field_name) if query else self._schema.vectors[0]
)

if vector_schema is None:
raise ValueError("No vector field found")

# set output_fields
core_vector.output_fields = ctx.output_fields

# set vector
if query.has_vector():
vec_data = query.vector
else:
elif query.has_id():
fetched = collection.Fetch([query.id])
doc = next(iter(fetched.values()))
if not doc:
return core_vector
vec_data = doc.get_any(vector_schema.name, vector_schema.data_type)
else:
return core_vector

target_dtype = DTYPE_MAP.get(vector_schema.data_type.value)
core_vector.set_vector(
Expand Down
3 changes: 2 additions & 1 deletion python/zvec/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from .collection import Collection
from .doc import Doc
from .param.query import Query, VectorQuery
from .param.query import Fts, Query, VectorQuery
from .schema.collection_schema import CollectionSchema
from .schema.field_schema import FieldSchema

Expand All @@ -24,6 +24,7 @@
"CollectionSchema",
"Doc",
"FieldSchema",
"Fts",
"Query",
"VectorQuery",
]
4 changes: 4 additions & 0 deletions python/zvec/model/param/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
AlterColumnOption,
CollectionOption,
FlatIndexParam,
FtsIndexParam,
FtsQueryParam,
HnswIndexParam,
HnswQueryParam,
HnswRabitqIndexParam,
Expand All @@ -36,6 +38,8 @@
"AlterColumnOption",
"CollectionOption",
"FlatIndexParam",
"FtsIndexParam",
"FtsQueryParam",
"HnswIndexParam",
"HnswQueryParam",
"HnswRabitqIndexParam",
Expand Down
57 changes: 50 additions & 7 deletions python/zvec/model/param/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,42 @@
from ...common import VectorType
from . import HnswQueryParam, HnswRabitqQueryParam, IVFQueryParam

__all__ = ["Query", "VectorQuery"]
__all__ = ["Fts", "Query", "VectorQuery"]


@dataclass(frozen=True)
class Fts:
    """Immutable parameters describing a full-text search (FTS) condition.

    Set one of the two fields. Constructing an ``Fts`` performs no checks;
    the "not both" rule is enforced later by ``Query._validate``.

    Attributes:
        query_string (Optional[str]): An FTS query expression, for example
            '+vector -slow "exact phrase"'. Mutually exclusive with
            ``match_string``. Default is None.
        match_string (Optional[str]): A natural-language match string that is
            tokenized and combined using the default operator. Mutually
            exclusive with ``query_string``. Default is None.
    """

    query_string: Optional[str] = None
    match_string: Optional[str] = None


@dataclass(frozen=True)
class Query:
"""Represents a search query for a specific field in a collection.

A `Query` can be constructed using either a document ID (to look up
its vector) or an explicit vector. It may optionally include index-specific
query parameters to control search behavior (e.g., `ef` for HNSW, `nprobe` for IVF).
A `Query` can be constructed for either vector search or full-text search,
but not both simultaneously.

Exactly one of `id` or `vector` should be provided. If both are given,
behavior is implementation-defined (typically `id` takes precedence).
For vector search, provide `id` or `vector` (and optionally `param`).
For FTS, provide `fts`.

Attributes:
field_name (str): Name of the field to query.
id (Optional[str], optional): Document ID to fetch vector from. Default is None.
vector (VectorType, optional): Explicit query vector. Default is None.
param (Optional[Union[HnswQueryParam, IVFQueryParam]], optional):
Index-specific query parameters. Default is None.
Index-specific query parameters for vector search. Default is None.
fts (Optional[Fts], optional): Full-text search parameters. Default is None.

Examples:
>>> import zvec
Expand All @@ -51,12 +67,18 @@ class Query:
... vector=[0.1, 0.2, 0.3],
... param=HnswQueryParam(ef=300)
... )
>>> # FTS query
>>> q3 = zvec.Query(
... field_name="content",
... fts=Fts(match_string="machine learning")
... )
"""

field_name: str
id: Optional[str] = None
vector: VectorType = None
param: Optional[Union[HnswQueryParam, HnswRabitqQueryParam, IVFQueryParam]] = None
fts: Optional[Fts] = None

def has_id(self) -> bool:
"""Check if the query is based on a document ID.
Expand All @@ -74,11 +96,32 @@ def has_vector(self) -> bool:
"""
return self.vector is not None and len(self.vector) > 0

def has_fts(self) -> bool:
    """Report whether this query carries a usable full-text search condition.

    Returns:
        bool: True when `fts` is set and at least one of its
        `query_string` / `match_string` is a non-empty string; False when
        `fts` is None or both strings are None or empty.
    """
    fts = self.fts
    if fts is None:
        return False
    return bool(fts.query_string) or bool(fts.match_string)

def _validate(self) -> None:
if self.field_name is None:
raise ValueError("Field name cannot be empty")
if self.id and self.vector:
raise ValueError("Cannot provide both id and vector")
if self.has_fts() and (
self.has_vector() or self.has_id() or self.param is not None
):
raise ValueError(
"Cannot combine fts with vector search fields (id/vector/param) in a single Query"
)
if self.fts is not None and self.fts.query_string and self.fts.match_string:
raise ValueError(
"Cannot provide both query_string and match_string in Fts; "
"they are mutually exclusive"
)


class VectorQuery(Query):
Expand Down
Loading
Loading