From 5b28af9e629929f1fadac66e41ee73e51dc83949 Mon Sep 17 00:00:00 2001 From: Razvan Burlacioiu Date: Tue, 9 Jun 2026 09:50:45 +0300 Subject: [PATCH] feat: create vector indices without training --- .../python/tests/test_create_empty_index.py | 91 +++++++++++++++---- python/python/tests/test_vector_index.py | 22 ++--- rust/lance/src/index.rs | 35 ++++++- rust/lance/src/index/create.rs | 2 - 4 files changed, 113 insertions(+), 37 deletions(-) diff --git a/python/python/tests/test_create_empty_index.py b/python/python/tests/test_create_empty_index.py index 77d4ab034c9..e813a838926 100644 --- a/python/python/tests/test_create_empty_index.py +++ b/python/python/tests/test_create_empty_index.py @@ -3,25 +3,43 @@ """Tests for creating empty indices with train=False.""" +import pytest + import lance import pyarrow as pa import pyarrow.compute as pc -def test_create_empty_scalar_index(): +@pytest.mark.parametrize("index_type", ["BTREE", "BITMAP"]) +def test_create_empty_scalar_index(index_type): data = pa.table({"id": range(100)}) dataset = lance.write_dataset(data, "memory://") - # Passing train=False to create an empty index - dataset.create_scalar_index("id", "BTREE", train=False) + dataset.create_scalar_index("id", index_type, train=False) - # Verify index exists and has correct stats indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0].index_type == "BTree" stats = dataset.stats.index_stats(indices[0].name) assert stats["num_indexed_rows"] == 0 assert stats["num_unindexed_rows"] == dataset.count_rows() + assert len(stats["indices"]) == 1 + assert stats["indices"][0]["num_rows"] == 0 + assert stats["indices"][0]["index_type"].upper().startswith(index_type[:4]) + + +def test_create_empty_fts_index(): + data = pa.table({"text": ["hello world", "foo bar", "lance db"]}) + dataset = lance.write_dataset(data, "memory://") + + dataset.create_scalar_index("text", "FTS", train=False) + + indices = dataset.describe_indices() + assert len(indices) == 1 + stats = dataset.stats.index_stats(indices[0].name) + assert stats["num_indexed_rows"] == 0 + assert stats["num_unindexed_rows"] == dataset.count_rows() + assert len(stats["indices"]) == 1 + assert stats["indices"][0]["num_rows"] == 0 def test_create_empty_vector_index(): @@ -31,17 +49,52 @@ def test_create_empty_vector_index(): data = pa.table({"vector": vectors}) dataset = lance.write_dataset(data, "memory://") - # Currently, vector indices with train=False are not supported - try: - dataset.create_index( - "vector", "IVF_PQ", num_partitions=10, num_sub_vectors=8, train=False - ) - # If we get here, the implementation has been added (unexpected for now) - assert False, ( - "Expected NotImplementedError for train=False on vector index, " - "but succeeded" - ) - except NotImplementedError as e: - # Expected error for unimplemented functionality - error_msg = str(e).lower() - assert "not yet implemented" in error_msg or "not implemented" in error_msg + dataset.create_index( + "vector", "IVF_PQ", num_partitions=10, num_sub_vectors=8, train=False + ) + + indices = dataset.describe_indices() + assert len(indices) == 1 + stats = dataset.stats.index_stats(indices[0].name) + assert stats["num_indexed_rows"] == 0 + assert stats["num_unindexed_rows"] == dataset.count_rows() + assert len(stats["indices"]) == 1 + assert stats["indices"][0]["num_rows"] == 0 + assert stats["indices"][0]["index_type"] == "Vector" + + +def test_create_empty_index_with_name(): + data = pa.table({"id": range(100)}) + dataset = lance.write_dataset(data, "memory://") + + dataset.create_scalar_index("id", "BTREE", name="my_custom_idx", train=False) + + indices = dataset.describe_indices() + assert len(indices) == 1 + assert indices[0].name == "my_custom_idx" + stats = dataset.stats.index_stats("my_custom_idx") + assert stats["num_indexed_rows"] == 0 + assert stats["num_unindexed_rows"] == dataset.count_rows() + + +def test_create_multiple_empty_indices(): + dim = 32 + values = pc.random(50 * dim).cast(pa.float32()) + vectors = pa.FixedSizeListArray.from_arrays(values, dim) + data = pa.table({"id": range(50), "text": ["a"] * 50, "vector": vectors}) + dataset = lance.write_dataset(data, "memory://") + + dataset.create_scalar_index("id", "BTREE", train=False) + dataset.create_scalar_index("text", "FTS", train=False) + dataset.create_index( + "vector", "IVF_PQ", num_partitions=5, num_sub_vectors=8, train=False + ) + + indices = dataset.describe_indices() + assert len(indices) == 3 + for idx in indices: + stats = dataset.stats.index_stats(idx.name) + assert stats["num_indexed_rows"] == 0 + assert stats["num_unindexed_rows"] == dataset.count_rows() + assert len(stats["indices"]) == 1 + assert stats["indices"][0]["num_rows"] == 0 diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 292b8079706..242b91d5e13 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1017,18 +1017,16 @@ def test_create_ivf_rq_index(): stats = ds.stats.index_stats("vector_idx") assert stats["indices"][0]["sub_index"]["packed"] is True - with pytest.raises( - NotImplementedError, - match="Creating empty vector indices with train=False is not yet implemented", - ): - ds.delete("id>=0") - ds = ds.create_index( - "vector", - index_type="IVF_RQ", - num_partitions=4, - num_bits=1, - replace=True, - ) + ds.delete("id>=0") + ds = ds.create_index( + "vector", + index_type="IVF_RQ", + num_partitions=4, + num_bits=1, + replace=True, + ) + stats = ds.stats.index_stats("vector_idx") + assert stats["num_indexed_rows"] == 0 zero_vectors = np.zeros((1000, 128)).astype(np.float32).tolist() tbl = pa.Table.from_pydict( diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 8984d507408..051fa33bc43 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1630,6 +1630,32 @@ async fn collect_regular_indices_statistics( } let index_details_wrapper = scalar::IndexDetails(index_details.clone()); + + // Empty indices (definition-only with no files) cannot be opened for stats. + // Emit a placeholder entry so the index still appears in `index_stats`, with + // num_rows = 0 to signal there is no indexed data yet. + let is_empty = meta + .fragment_bitmap + .as_ref() + .map(|bm| bm.is_empty()) + .unwrap_or(false); + if is_empty { + let index_type = if index_details_wrapper.is_vector() { + "Vector".to_string() + } else { + index_details_wrapper + .get_plugin() + .map(|p| p.name().to_string()) + .unwrap_or_else(|_| "Unknown".to_string()) + }; + indices_stats.push(serde_json::json!({ + "index_type": index_type, + "uuid": meta.uuid.to_string(), + "num_rows": 0, + })); + continue; + } + if let Ok(plugin) = index_details_wrapper.get_plugin() && let Some(stats) = plugin .load_statistics(index_store.clone(), index_details.as_ref()) @@ -1832,12 +1858,13 @@ impl DatasetIndexInternalExt for Dataset { .await? .ok_or_else(|| Error::index(format!("Index with id {} does not exist", uuid)))?; - // Check if this is a vector index by looking at the files list - let is_vector_index = if let Some(files) = &index_meta.files { - // If we have file metadata, check if INDEX_FILE_NAME is in the list + // Check if this is a vector index. First check index_details metadata (reliable + // for empty/definition-only indices), then fall back to file-based detection. + let is_vector_index = if let Some(details) = &index_meta.index_details { + scalar::IndexDetails(details.clone()).is_vector() + } else if let Some(files) = &index_meta.files { files.iter().any(|f| f.path == INDEX_FILE_NAME) } else { - // Fall back to file existence check for older indices without file metadata let index_dir = self.indice_files_dir(&index_meta)?; let index_file = index_dir .clone() diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 2b6992e4849..ef8d8dd24db 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -436,8 +436,6 @@ impl<'a> CreateIndexBuilder<'a> { if train { ext.create_index(self.dataset, column, &index_id, self.params) .await?; - } else { - todo!("create empty vector index when train=false"); } // Capture file sizes after vector index creation let index_dir = self.dataset.indices_dir().join(index_id.to_string());