From 018ddd0242e4f0fd4cee087fd2f2425c4707e8d9 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Wed, 1 Jul 2026 16:26:28 +0800 Subject: [PATCH] docs(indexing): add rabitq performance guide --- docs/docs.json | 1 + docs/indexing/index.mdx | 2 +- docs/indexing/quantization.mdx | 8 +- docs/indexing/rabitq.mdx | 113 ++++++++++++++++++ docs/snippets/indexing.mdx | 2 + .../indexing/rabitq-approx-mode-latency.svg | 35 ++++++ .../indexing/rabitq-approx-mode-qps.svg | 35 ++++++ .../images/indexing/rabitq-hot-memory.svg | 39 ++++++ .../images/indexing/rabitq-p99-latency.svg | 39 ++++++ .../images/indexing/rabitq-qps-per-core.svg | 39 ++++++ .../assets/images/indexing/rabitq-recall.svg | 39 ++++++ tests/py/test_indexing.py | 23 ++++ 12 files changed, 370 insertions(+), 5 deletions(-) create mode 100644 docs/indexing/rabitq.mdx create mode 100644 docs/static/assets/images/indexing/rabitq-approx-mode-latency.svg create mode 100644 docs/static/assets/images/indexing/rabitq-approx-mode-qps.svg create mode 100644 docs/static/assets/images/indexing/rabitq-hot-memory.svg create mode 100644 docs/static/assets/images/indexing/rabitq-p99-latency.svg create mode 100644 docs/static/assets/images/indexing/rabitq-qps-per-core.svg create mode 100644 docs/static/assets/images/indexing/rabitq-recall.svg diff --git a/docs/docs.json b/docs/docs.json index 87db70bc..0f624475 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -110,6 +110,7 @@ "indexing/scalar-index", "indexing/gpu-indexing", "indexing/quantization", + "indexing/rabitq", "indexing/reindexing" ] }, diff --git a/docs/indexing/index.mdx b/docs/indexing/index.mdx index 815cf5f2..246287ac 100644 --- a/docs/indexing/index.mdx +++ b/docs/indexing/index.mdx @@ -53,7 +53,7 @@ Vector indexes can use different quantization methods to compress vectors and im | :----------- | :------- | :---------- | | `PQ` (Product Quantization) | Default choice for most vector search scenarios. Use when you need to balance index size and recall. 
| Divides vectors into subvectors and quantizes each subvector independently. Provides a good balance between compression ratio and search accuracy. | | `SQ` (Scalar Quantization) | Use when you need faster indexing or when vector dimensions have consistent value ranges. | Quantizes each dimension independently. Simpler than PQ but typically provides less compression. | -| `RQ` (RabitQ Quantization) | Use when you need maximum compression or have specific per-dimension requirements. | Per-dimension quantization using a RabitQ codebook. Provides fine-grained control over compression per dimension. For `IVF_RQ`, vector dimensions must be divisible by `8`. | +| `RQ` (RaBitQ Quantization) | Use when you need compact high-dimensional vector search with strong recall and low serving memory. | RaBitQ supports a classic 1-bit path and multi-bit quantization for higher recall. For `IVF_RQ`, vector dimensions must be divisible by `8`. See the [RaBitQ indexing guide](/indexing/rabitq) for performance and memory tradeoffs. | | `None/Flat` | Use for binary vectors (with `hamming` distance) or when you need maximum recall and have sufficient storage. | No quantization—stores raw vectors. Provides the highest accuracy but requires more storage and memory. | ## Understanding the IVF-PQ Index diff --git a/docs/indexing/quantization.mdx b/docs/indexing/quantization.mdx index 85d277cb..2da67fb0 100644 --- a/docs/indexing/quantization.mdx +++ b/docs/indexing/quantization.mdx @@ -16,7 +16,7 @@ Use quantization when: LanceDB currently exposes multiple quantized vector index types, including: - `IVF_PQ` -- Inverted File index with Product Quantization (default). See the [vector indexing guide](/indexing/vector-index) for `IVF_PQ` examples. - `IVF_SQ` -- Inverted File index with Scalar Quantization. This is available in Python and Rust; TypeScript does not currently expose `IvfSq`. -- `IVF_RQ` -- Inverted File index with **RaBitQ** quantization (binary, 1 bit per dimension). 
Requires vector dimensions divisible by `8`. See [below](#rabitq-quantization) for details. +- `IVF_RQ` -- Inverted File index with **RaBitQ** quantization. It supports the classic 1-bit representation and multi-bit quantization for higher recall. Requires vector dimensions divisible by `8`. See [below](#rabitq-quantization) for details. - `IVF_HNSW_SQ` -- IVF partitions with an **HNSW graph per partition** plus **Scalar Quantization**. Strong recall/latency/size trade-off for most workloads. - `IVF_HNSW_PQ` -- IVF partitions with an **HNSW graph per partition** plus **Product Quantization**. Prefer when PQ-level compression matters and you still want HNSW-style in-partition search. @@ -26,7 +26,7 @@ Use the same distance metric when training the index and running queries against ## RaBitQ quantization -RaBitQ is a binary quantization method that represents each normalized embedding using **1 bit per dimension**, plus a couple of small corrective scalars. In practice, a 1,024-dimensional `float32` vector that would normally take 4 KB can be compressed to roughly a few hundred bytes with RaBitQ, while still maintaining reasonable recall. +RaBitQ is a quantization method that can represent each normalized embedding using the classic **1 bit per dimension** layout, plus a couple of small corrective scalars. LanceDB also supports multi-bit RaBitQ through `num_bits`, which stores extra quantized signal for higher recall. In practice, a 1,024-dimensional `float32` vector that would normally take 4 KB can be compressed to roughly a few hundred bytes with RaBitQ, while still maintaining reasonable recall. 
### How RaBitQ works @@ -42,7 +42,7 @@ Compared to `IVF_PQ`, RaBitQ: - Builds indexes faster and handles updates more easily - Maintains or improves recall at high dimensionality under the same storage budget -For a deeper dive into the theory and some benchmark results, see the blog post: [LanceDB's RaBitQ Quantization for Blazing Fast Vector Search](https://lancedb.com/blog/feature-rabitq-quantization/). +For a deeper dive into current performance, memory, multi-bit recall, and `approx_mode` tradeoffs, see the [RaBitQ indexing guide](/indexing/rabitq). For the original theory and early benchmark results, see the blog post: [LanceDB's RaBitQ Quantization for Blazing Fast Vector Search](https://lancedb.com/blog/feature-rabitq-quantization/). ### Using RaBitQ @@ -54,7 +54,7 @@ When using `IVF_RQ`, vector dimensions must be divisible by `8`. `num_bits` controls how many bits per dimension are used: -1 bit is the classic RaBitQ setting. You can set it to 2, 4, or 8 bits to improve fidelity for better precision or recall — the main trade-off is additional storage for the extra bits per dimension, with only a modest increase in query-time compute. +1 bit is the classic RaBitQ setting. You can set `num_bits` to a higher value to improve fidelity for better precision or recall. The main trade-off is additional storage for the extra bits per dimension, with only a modest increase in query-time compute. It's also possible to tune the number of IVF partitions in `IVF_RQ`, similar to how you would do in `IVF_PQ`. diff --git a/docs/indexing/rabitq.mdx b/docs/indexing/rabitq.mdx new file mode 100644 index 00000000..6bc64179 --- /dev/null +++ b/docs/indexing/rabitq.mdx @@ -0,0 +1,113 @@ +--- +title: "RaBitQ Indexing" +sidebarTitle: "RaBitQ" +description: "Use IVF_RQ and multi-bit RaBitQ to improve recall, tail latency, and serving memory for high-dimensional vector search in LanceDB." 
+icon: "gauge-high" +keywords: ["rabitq", "ivf_rq", "approx_mode", "vector search", "quantization", "recall"] +--- +import { PyRabitqCreateIndex as RabitqCreateIndex } from '/snippets/indexing.mdx'; + +RaBitQ (`IVF_RQ`) is LanceDB's high-compression vector index for large embedding workloads. It is built for the cases where you want strong recall and low latency without keeping full-precision vectors in the hot serving path. + + +The improvements described on this page are available in LanceDB Cloud. + + +## Start with the result + +On a DBpedia 1M benchmark with 1,536-dimensional vectors, `top_k=10`, `nprobes=24`, and no raw-vector refine, multi-bit `IVF_RQ` improves both quality and speed compared with `IVF_PQ`. + +`QPS/core` is single-core throughput. It is not the total throughput of a LanceDB deployment. + +![p99 latency comparison for IVF_PQ and multi-bit IVF_RQ](/static/assets/images/indexing/rabitq-p99-latency.svg) + +![QPS per core comparison for IVF_PQ and multi-bit IVF_RQ](/static/assets/images/indexing/rabitq-qps-per-core.svg) + +| Index | Recall@10 | Avg latency | p99 latency | QPS/core | +| :--- | ---: | ---: | ---: | ---: | +| `IVF_PQ`, no refine | 74.83% | 12.80 ms | 14.72 ms | 78.0 | +| `IVF_RQ`, 3-bit | 93.52% | 3.82 ms | 4.74 ms | 261.0 | +| `IVF_RQ`, 5-bit | 96.24% | 4.56 ms | 5.57 ms | 218.1 | +| `IVF_RQ`, 7-bit | 96.83% | 4.96 ms | 5.94 ms | 200.7 | + +The 5-bit `IVF_RQ` index reaches 96.24% recall while keeping p99 latency about 2.6x lower than `IVF_PQ` in this benchmark. If you prioritize throughput, 3-bit `IVF_RQ` reaches 261 QPS/core with much higher recall than `IVF_PQ`. + +## Higher recall without raw-vector refine + +Classic RaBitQ stores one bit per dimension. Multi-bit RaBitQ keeps the same compact search structure and adds extra bits that preserve more of the original vector signal. 
+ +![Recall comparison for IVF_PQ and multi-bit IVF_RQ](/static/assets/images/indexing/rabitq-recall.svg) + +This matters because high recall with `IVF_PQ` often depends on `refine_factor`: LanceDB first searches the compressed index, then fetches and reranks extra candidates using the original full-precision vectors. That can work well, but it increases the memory and I/O pressure of the serving path. + +With multi-bit `IVF_RQ`, LanceDB can recover much of that quality directly from the quantized index. You can get high recall without making raw-vector refine the default path for every query. + +## Serving memory tradeoff + +The memory difference is easiest to see if you separate index storage from hot serving memory. + +![Hot serving memory comparison for IVF_PQ refine and IVF_RQ modes](/static/assets/images/indexing/rabitq-hot-memory.svg) + +| Serving path | Hot vector memory intuition | +| :--- | :--- | +| `IVF_PQ`, no refine | Very small, but recall can be limited. | +| 5-bit `IVF_RQ` with `approx_mode="fast"` | Can use the 1-bit RaBitQ search path, so hot search memory can match the 1-bit budget. | +| 5-bit `IVF_RQ` with multi-bit scoring | Uses more quantized bits, but still avoids full raw vectors in the hot path. | +| `IVF_PQ` with raw-vector refine | Requires the compressed index plus full-precision vectors for reranking. | + + +A 5-bit `IVF_RQ` index stores more quantized code bytes than a 1-bit `IVF_RQ` index, and can be larger on disk than a default `IVF_PQ` index. The advantage is in high-recall serving: multi-bit `IVF_RQ` can use compact quantized codes for reranking, while `IVF_PQ` typically needs full-precision vectors to recover similar recall. + + +This is also where `approx_mode` becomes useful. A 5-bit `IVF_RQ` index can run a fast query that only uses the 1-bit path, or a higher-recall query that uses more of the stored multi-bit signal. You do not need to rebuild the index to move between those points. 
+ +## How LanceDB makes it fast + +The recent RaBitQ work improves the full query path: + +- **Fast rotation**: RaBitQ uses a randomized rotation before quantization. The optimized rotation path reduces the cost of preparing vectors for compact binary-style scoring. +- **Multi-bit reranking**: Extra bits give LanceDB more signal during candidate scoring, so recall improves without always falling back to raw-vector refine. +- **SIMD distance kernels**: The inner scoring loop runs over packed quantized data and uses CPU vector instructions to evaluate many dimensions at once. +- **A one-bit fast path**: Even if the index stores extra bits, `approx_mode="fast"` can search with the 1-bit representation when you want the lowest-latency path. + +The result is not just a smaller index format. It is a faster high-recall serving path for large, high-dimensional vectors. + +## Tune the tradeoff at query time + +`approx_mode` lets you tune recall and performance per query instead of rebuilding separate indexes for different product needs. The option applies to RQ-quantized indexes such as `IVF_RQ`; other index types ignore it. + +| `approx_mode` | When to use it | +| :--- | :--- | +| `fast` | Lowest-latency path. Useful for autocomplete, exploration, high-fanout retrieval, or queries with recall headroom. | +| `normal` | Default balance. Good starting point for most production traffic. | +| `accurate` | Higher-recall path. Useful for quality-sensitive retrieval, evaluation, and requests where a few extra milliseconds are acceptable. 
| + +On the same 5-bit `IVF_RQ` index, the query mode controls the speed/quality point: + +![Average latency comparison across approx_mode values](/static/assets/images/indexing/rabitq-approx-mode-latency.svg) + +![QPS per core comparison across approx_mode values](/static/assets/images/indexing/rabitq-approx-mode-qps.svg) + +| `approx_mode` | Recall@10 | Avg latency | Approx QPS/core | +| :--- | ---: | ---: | ---: | +| `fast` | 81.42% | 3.175 ms | 315 | +| `normal` | 96.11% | 3.802 ms | 263 | +| `accurate` | 96.57% | 4.508 ms | 222 | + +Approx QPS/core is computed from the single-core mean latency in this benchmark. Use it for relative comparison, not as a cluster-level throughput estimate. + +See the [vector search guide](/search/vector-search) for the current query API behavior and the full interaction between `approx_mode`, `nprobes`, and `refine_factor`. + +## Create an IVF_RQ index + +To switch a table to RaBitQ, create an `IVF_RQ` index with the `IvfRq` config object. Start with `num_bits=5` when recall matters, and lower it if you want a smaller index. + + + + {RabitqCreateIndex} + + + +All of these RaBitQ updates are available in LanceDB Cloud. Build an `IVF_RQ` index once, then tune recall and performance at query time with `approx_mode` as your workload changes. + +Use `IVF_RQ` when your workload has high-dimensional embeddings, needs strong recall, and cannot afford to keep full-precision vectors hot just to rerank every query. For small vectors, especially dimensions at or below 256, benchmark `IVF_PQ` as well because it can still be a better fit. 
diff --git a/docs/snippets/indexing.mdx b/docs/snippets/indexing.mdx index 278a097e..e1cd7c09 100644 --- a/docs/snippets/indexing.mdx +++ b/docs/snippets/indexing.mdx @@ -12,6 +12,8 @@ export const PyGpuIndexCuda = "table.create_index(\n num_partitions=256,\n export const PyGpuIndexMps = "table.create_index(\n num_partitions=256,\n num_sub_vectors=96,\n accelerator=\"mps\",\n)\n"; +export const PyRabitqCreateIndex = "from lancedb.index import IvfRq\n\ntable.create_index(\n \"vector\",\n config=IvfRq(\n distance_type=\"cosine\",\n num_bits=5,\n ),\n replace=True,\n)\n"; + export const PyReindexingIncremental = "table = db.open_table(\"reindexing_incremental\")\ntable.add([{\"vector\": [3.1, 4.1], \"text\": \"Frodo was a happy puppy\"}])\ntable.optimize()\n"; export const PyScalarIndexBuild = "tbl = db.open_table(\"scalar_index_build\")\ntbl.create_scalar_index(\"book_id\")\ntbl.create_scalar_index(\"publisher\", index_type=\"BITMAP\")\n"; diff --git a/docs/static/assets/images/indexing/rabitq-approx-mode-latency.svg b/docs/static/assets/images/indexing/rabitq-approx-mode-latency.svg new file mode 100644 index 00000000..da15235c --- /dev/null +++ b/docs/static/assets/images/indexing/rabitq-approx-mode-latency.svg @@ -0,0 +1,35 @@ + + approx_mode latency comparison + Bar chart comparing average latency for fast, normal, and accurate approx_mode settings on the same 5-bit IVF_RQ index. 
+ + approx_mode latency + Same 5-bit IVF_RQ index, lower is better + + + + + + + + + + 0 + 1.25 + 2.50 + 3.75 + 5 ms + + + + + + 3.175 ms + 3.802 ms + 4.508 ms + + + fast + normal + accurate + + diff --git a/docs/static/assets/images/indexing/rabitq-approx-mode-qps.svg b/docs/static/assets/images/indexing/rabitq-approx-mode-qps.svg new file mode 100644 index 00000000..cf839b73 --- /dev/null +++ b/docs/static/assets/images/indexing/rabitq-approx-mode-qps.svg @@ -0,0 +1,35 @@ + + approx_mode QPS per core comparison + Bar chart comparing approximate QPS per core for fast, normal, and accurate approx_mode settings on the same 5-bit IVF_RQ index. + + approx_mode QPS per core + Derived from average latency as 1000 / avg_latency_ms + + + + + + + + + + 0 + 85 + 170 + 255 + 340 + + + + + + 315 + 263 + 222 + + + fast + normal + accurate + + diff --git a/docs/static/assets/images/indexing/rabitq-hot-memory.svg b/docs/static/assets/images/indexing/rabitq-hot-memory.svg new file mode 100644 index 00000000..39c248b2 --- /dev/null +++ b/docs/static/assets/images/indexing/rabitq-hot-memory.svg @@ -0,0 +1,39 @@ + + Serving hot memory comparison + Horizontal bar chart showing relative hot memory for PQ no refine, RQ fast, RQ full 5-bit, and PQ plus refine. + + Serving hot memory by query path + Relative to raw f32 vectors, lower is better + + PQ no refine + RQ fast + RQ full 5-bit + PQ + refine + + + + + + + + + + + + + + + ~1/32 raw + ~1/32 raw + ~5/32 raw + ~33/32 raw + + + 0 + 0.25x + 0.50x + 0.75x + 1.0x raw + + A 5-bit RQ index can use the 1-bit fast path for PQ-like hot memory, or full bits for higher recall. 
+ diff --git a/docs/static/assets/images/indexing/rabitq-p99-latency.svg b/docs/static/assets/images/indexing/rabitq-p99-latency.svg new file mode 100644 index 00000000..eea3dfa4 --- /dev/null +++ b/docs/static/assets/images/indexing/rabitq-p99-latency.svg @@ -0,0 +1,39 @@ + + DBpedia p99 latency comparison + Bar chart comparing p99 latency for IVF_PQ and IVF_RQ 3-bit, 5-bit, and 7-bit indexes on DBpedia 1M. + + DBpedia 1M: p99 latency + Lower is better, nprobes=24, top-k=10, no raw-vector refine + + + + + + + + + + 0 + 4 + 8 + 12 + 16 ms + + + + + + + 14.72 ms + 4.74 ms + 5.57 ms + 5.94 ms + + + IVF_PQ + RQ 3-bit + RQ 5-bit + RQ 7-bit + + 5-bit IVF_RQ keeps p99 latency about 2.6x lower than IVF_PQ while reaching 96.24% recall. + diff --git a/docs/static/assets/images/indexing/rabitq-qps-per-core.svg b/docs/static/assets/images/indexing/rabitq-qps-per-core.svg new file mode 100644 index 00000000..d8b4e3be --- /dev/null +++ b/docs/static/assets/images/indexing/rabitq-qps-per-core.svg @@ -0,0 +1,39 @@ + + DBpedia QPS per core comparison + Bar chart comparing QPS per core for IVF_PQ and IVF_RQ 3-bit, 5-bit, and 7-bit indexes on DBpedia 1M. + + DBpedia 1M: QPS per core + Higher is better, nprobes=24, top-k=10, no raw-vector refine + + + + + + + + + + 0 + 70 + 140 + 210 + 280 + + + + + + + 78.0 + 261.0 + 218.1 + 200.7 + + + IVF_PQ + RQ 3-bit + RQ 5-bit + RQ 7-bit + + QPS is reported per core, not as total system or cluster throughput. + diff --git a/docs/static/assets/images/indexing/rabitq-recall.svg b/docs/static/assets/images/indexing/rabitq-recall.svg new file mode 100644 index 00000000..e68d7605 --- /dev/null +++ b/docs/static/assets/images/indexing/rabitq-recall.svg @@ -0,0 +1,39 @@ + + DBpedia recall comparison + Bar chart comparing recall for IVF_PQ and IVF_RQ 3-bit, 5-bit, and 7-bit indexes on DBpedia 1M. 
+ + DBpedia 1M: Recall@10 + Higher is better, nprobes=24, top-k=10, no raw-vector refine + + + + + + + + + + 70% + 77.5% + 85% + 92.5% + 100% + + + + + + + 74.83% + 93.52% + 96.24% + 96.83% + + + IVF_PQ + RQ 3-bit + RQ 5-bit + RQ 7-bit + + Multi-bit RaBitQ stores more signal in the index, reducing the need for raw-vector refine. + diff --git a/tests/py/test_indexing.py b/tests/py/test_indexing.py index 7f63e8f0..409a54cf 100644 --- a/tests/py/test_indexing.py +++ b/tests/py/test_indexing.py @@ -274,6 +274,29 @@ def test_vector_index_custom_name(tmp_db): assert table.index_stats("my_custom_index") +def test_rabitq_create_index(tmp_db): + table = tmp_db.create_table( + "rabitq_index", + _make_vector_rows(512, 8), + mode="overwrite", + ) + + # --8<-- [start:rabitq_create_index] + from lancedb.index import IvfRq + + table.create_index( + "vector", + config=IvfRq( + distance_type="cosine", + num_bits=5, + ), + replace=True, + ) + # --8<-- [end:rabitq_create_index] + + assert table.list_indices() + + def test_vector_index_hnsw(tmp_db): table = tmp_db.create_table( "vector_index_hnsw",