From 902164cb70792c57e7e9adc41a79cdc4a6d96b42 Mon Sep 17 00:00:00 2001 From: Mingzhuo Yin Date: Fri, 2 Feb 2024 06:02:26 +0000 Subject: [PATCH] fix: breaking change for pgvecto_rs Signed-off-by: Mingzhuo Yin --- engine/clients/pgvector/config.py | 6 ++-- engine/clients/pgvector/search.py | 7 +--- engine/clients/pgvector/upload.py | 16 ++++++---- ...rust_HNSW_single_node_laion-768-5m-ip.json | 16 ++++++---- ...ngle_node_laion-768-5m-probability-ip.json | 32 ++++++++++++------- 5 files changed, 43 insertions(+), 34 deletions(-) diff --git a/engine/clients/pgvector/config.py b/engine/clients/pgvector/config.py index 2d40021..40f1f54 100644 --- a/engine/clients/pgvector/config.py +++ b/engine/clients/pgvector/config.py @@ -13,9 +13,9 @@ } DISTANCE_MAPPING_CREATE_RUST = { - Distance.L2: "l2_ops", - Distance.COSINE: "cosine_ops", - Distance.DOT: "dot_ops", + Distance.L2: "vector_l2_ops", + Distance.COSINE: "vector_cos_ops", + Distance.DOT: "vector_dot_ops", } DISTANCE_MAPPING_SEARCH = { diff --git a/engine/clients/pgvector/search.py b/engine/clients/pgvector/search.py index d36c769..45d0318 100644 --- a/engine/clients/pgvector/search.py +++ b/engine/clients/pgvector/search.py @@ -33,12 +33,7 @@ def search_one(cls, vector: List[float], meta_conditions, top: Optional[int], sc cur.execute("BEGIN;") # set index create parameter for key in cls.search_params["params"].keys(): - if cls.engine_type == "c": - cur.execute(f"SET LOCAL {key} = {cls.search_params['params'][key]};") - else: - # pgvector_rs only support hnsw - cur.execute(f"SET LOCAL vectors.k = {cls.search_params['params']['hnsw.ef_search']};") - break + cur.execute(f"SET LOCAL {key} = {cls.search_params['params'][key]};") meta_conditions = cls.parser.parse(meta_conditions) if meta_conditions: diff --git a/engine/clients/pgvector/upload.py b/engine/clients/pgvector/upload.py index c7e0b54..6c3c3a5 100644 --- a/engine/clients/pgvector/upload.py +++ b/engine/clients/pgvector/upload.py @@ -30,7 +30,7 @@ def upload_batch(cls, ids: List[int], vectors: List[list], metadata: List[Option raise RuntimeError("PGVector batch upload unhealthy") # Getting the names of structured data columns based on the first meta information. col_name_tuple = ('id', 'vector') - col_type_tuple = ('%s', '%s::real[]') + col_type_tuple = ('%s', '%s') if metadata[0] is not None: for col_name in list(metadata[0].keys()): col_name_tuple += (col_name,) @@ -38,7 +38,7 @@ def upload_batch(cls, ids: List[int], vectors: List[list], metadata: List[Option insert_data = [] for i in range(0, len(ids)): - temp_tuple = (ids[i], vectors[i]) + temp_tuple = (ids[i], str(vectors[i])) if metadata[i] is not None: for col_name in list(metadata[i].keys()): value = metadata[i][col_name] @@ -74,11 +74,6 @@ def post_upload(cls, distance): if cls.engine_type == "rust": create_index_command = f""" CREATE INDEX ON {PGVECTOR_INDEX} USING vectors (vector {cls.distance}) WITH (options=$$ -capacity = {int(cls.vector_count*1.2)} -[vectors] -memmap = "ram" -[algorithm.hnsw] -memmap = "ram" {index_options_rust} $$); """ @@ -92,3 +87,10 @@ def post_upload(cls, distance): with cls.conn.cursor() as cur: cur.execute("SELECT phase, tuples_done, tuples_total FROM pg_stat_progress_create_index;") cls.conn.commit() + if cls.engine_type == "rust": + with cls.conn.cursor() as cur: + indexing = True + while indexing: + cur.execute("SELECT idx_indexing FROM pg_vector_index_stat;") + indexing = cur.fetchone()[0] + cls.conn.commit() diff --git a/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-ip.json b/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-ip.json index 8f91e8f..6973602 100644 --- a/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-ip.json +++ b/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-ip.json @@ -6,7 +6,7 @@ "platform": "CloudTest_v0.0.4", "index_type": "HNSW", "dataset": "laion-768-5m-ip", - "version": "pg15-latest", + "version": "pg16-v0.2.0", "branch": "master", "commit": "sha256:38c7e5c4fa3afd48fb4911b3f96489ac287b59f8bfdd9b7b55c0eca898ffec21", "remark": "pgvector implemented in RUST", @@ -24,28 +24,28 @@ "parallel": 4, "top": 10, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100 } }, { "parallel": 4, "top": 100, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100 } }, { "parallel": 8, "top": 10, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100 } }, { "parallel": 8, "top": 100, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100 } } ], @@ -53,8 +53,10 @@ "parallel": 16, "batch_size": 64, "index_params": { - "m": 12, - "ef_construction": 100 + "indexing.hnsw.m": 12, + "indexing.hnsw.ef_construction": 100, + "optimizing.optimizing_threads": 8, + "segment.max_sealed_segment_size": 5000000 }, "index_type": "hnsw", "engine_type": "rust" diff --git a/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-probability-ip.json b/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-probability-ip.json index af2f8f4..5b0bdda 100644 --- a/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-probability-ip.json +++ b/experiments/needs_editing/pgvector_rust_HNSW_single_node_laion-768-5m-probability-ip.json @@ -6,7 +6,7 @@ "platform": "CloudTest_v0.0.4", "index_type": "HNSW", "dataset": "laion-768-5m-ip-probability", - "version": "pg15-latest", + "version": "pg16-v0.2.0", "branch": "master", "commit": "sha256:38c7e5c4fa3afd48fb4911b3f96489ac287b59f8bfdd9b7b55c0eca898ffec21", "remark": "pgvector implemented in RUST", @@ -24,7 +24,8 @@ "parallel": 4, "top": 10, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.01 @@ -34,7 +35,8 @@ "parallel": 4, "top": 10, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.1 @@ -44,7 +46,8 @@ "parallel": 4, "top": 100, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.01 @@ -54,7 +57,8 @@ "parallel": 4, "top": 100, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.1 @@ -64,7 +68,8 @@ "parallel": 8, "top": 10, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.01 @@ -74,7 +79,8 @@ "parallel": 8, "top": 10, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.1 @@ -84,7 +90,8 @@ "parallel": 8, "top": 100, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.01 @@ -94,7 +101,8 @@ "parallel": 8, "top": 100, "params": { - "hnsw.ef_search": 100 + "vectors.hnsw_ef_search": 100, + "vectors.search_mode": "vbase" }, "query_meta": { "probability": 0.1 @@ -105,8 +113,10 @@ "parallel": 16, "batch_size": 64, "index_params": { - "m": 12, - "ef_construction": 100 + "indexing.hnsw.m": 12, + "indexing.hnsw.ef_construction": 100, + "optimizing.optimizing_threads": 8, + "segment.max_sealed_segment_size": 5000000 }, "index_type": "hnsw", "engine_type": "rust"