From a3a804b8c7ba4b9f65a26741f12b1d00f5a671a3 Mon Sep 17 00:00:00 2001
From: root <root@ip-172-31-37-14.ec2.internal>
Date: Sat, 23 May 2026 17:14:47 +0000
Subject: [PATCH 1/6] Add full-parameter CSV export from results.json for
 Pareto analysis.

Export sweep dimensions and real query-throughput into raw and frontier CSVs,
persist outputs under csv-export/, add 1M/10M CAGRA_SEARCH grid sweep specs,
and improve CAGRA_SEARCH index naming and build-plot matching.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 BenchmarkWithCAGRASearchPrototype.md |  25 +++
 README.md                            |  15 +-
 convert_to_nvidia_format.py          |  15 +-
 export_results_csv.py                | 247 +++++++++++++++++++++++++++
 plot_pareto.py                       |  20 ++-
 run_pareto_analysis.sh               |  84 +++------
 sweeps_cagra_search_grid.json        |  38 +++++
 sweeps_cagra_search_grid_1M.json     |  38 +++++
 8 files changed, 412 insertions(+), 70 deletions(-)
 create mode 100644 export_results_csv.py
 create mode 100644 sweeps_cagra_search_grid.json
 create mode 100644 sweeps_cagra_search_grid_1M.json
diff --git a/BenchmarkWithCAGRASearchPrototype.md b/BenchmarkWithCAGRASearchPrototype.md
index 5e90f5f..32c1d9c 100644
--- a/BenchmarkWithCAGRASearchPrototype.md
+++ b/BenchmarkWithCAGRASearchPrototype.md
@@ -21,6 +21,31 @@ I am mentioning the versions below that I use on `ubuntu-24.04`
 - ninja and nccl (used while building cuVS)
 - nvtop
 
+### Python (Pareto CSV export and plots)
+
+After a sweep, `run_sweep.sh` calls `run_pareto_analysis.sh`, which needs:
+
+- Python 3.7+
+- **pandas** — `export_results_csv.py` (raw build/search CSVs) and `data_export.py` (Pareto frontiers via `get_frontier`)
+- **matplotlib**, **numpy**, **click** — `plot_pareto.py` (throughput/latency plots)
+- **pyyaml** — optional helpers elsewhere in the repo
+
+Install once:
+
+```sh
+pip install pandas matplotlib numpy click pyyaml
+```
+
+Re-run analysis only (no re-benchmark) after a completed sweep:
+
+```sh
+cd vectorsearch-benchmarks
+./run_pareto_analysis.sh <benchmark-id> <dataset-folder-name>
+# Example: ./run_pareto_analysis.sh wOKdmU wiki1m
+```
+
+Plots land under `results/<benchmark-id>/<dataset>/plots/`. `export_results_csv.py` generates full-parameter CSVs (raw rows plus throughput and latency Pareto frontiers, built from each run's `results.json`), and `run_pareto_analysis.sh` copies them to `results/<benchmark-id>/csv-export/<dataset>/`.
+
 You can get the above using the following:
 ```sh
 sudo apt install -y axel ninja-build libnccl2 libnccl-dev nvtop
diff --git a/README.md b/README.md
index be9f7f7..45bcf46 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,19 @@ Benchmark system for comparing CAGRA (GPU) vs Lucene HNSW (CPU) vector search al
 1. **Prerequisites:**
    - JDK 22+
    - CUDA libraries
-   - Python 3.7+
-   - pip install pyyaml matplotlib numpy click pandas
+   - Python 3.7+ with packages for post-run analysis and plots (required if you use `--run-benchmarks`, which triggers Pareto export/plotting):
+
+   ```bash
+   pip install pandas matplotlib numpy click pyyaml
+   ```
+
+   | Package | Used for |
+   |---------|----------|
+   | `pandas` | `export_results_csv.py` / `data_export.py` — build/search CSVs and Pareto frontiers |
+   | `matplotlib`, `numpy`, `click` | `plot_pareto.py` — throughput/latency plots |
+   | `pyyaml` | YAML utilities in supporting scripts |
+
+   See also [BenchmarkWithCAGRASearchPrototype.md](BenchmarkWithCAGRASearchPrototype.md) for the full CAGRA prototype setup.
 
 2. **Set library paths:**
    ```bash
diff --git a/convert_to_nvidia_format.py b/convert_to_nvidia_format.py
index 54f3931..7b448c8 100755
--- a/convert_to_nvidia_format.py
+++ b/convert_to_nvidia_format.py
@@ -20,6 +20,15 @@ def create_index_name(config: Dict) -> str:
         graph_degree = config.get('cagraGraphDegree', 0)
         intermediate_degree = config.get('cagraIntermediateGraphDegree', 0)
         return f"ef{ef_search}-deg{graph_degree}-ideg{intermediate_degree}"
+    elif algorithm in ['CAGRA_SEARCH', 'cagra_search']:
+        graph_degree = config.get('cagraGraphDegree', 0)
+        intermediate_degree = config.get('cagraIntermediateGraphDegree', 0)
+        search_width = config.get('cagraSearchWidth', 0)
+        query_threads = config.get('queryThreads', 0)
+        return (
+            f"ef{ef_search}-sw{search_width}-deg{graph_degree}-"
+            f"ideg{intermediate_degree}-qt{query_threads}"
+        )
     else:
         return f"ef{ef_search}"
 
@@ -51,7 +60,11 @@ def convert_results_to_nvidia_format(results_json_path: str, output_dir: str, da
         raise KeyError("No mean-latency metric found")
 
     latency_ms = float(metrics[latency_key])
-    throughput = 1000.0 / latency_ms if latency_ms > 0 else 0
+    throughput_key = next((key for key in metrics.keys() if 'query-throughput' in key.lower()), None)
+    if throughput_key:
+        throughput = float(metrics[throughput_key])
+    else:
+        throughput = 1000.0 / latency_ms if latency_ms > 0 else 0
 
     benchmark = {
         "name": f"{algorithm}/{index_name}",
diff --git a/export_results_csv.py b/export_results_csv.py
new file mode 100644
index 0000000..ead2d1b
--- /dev/null
+++ b/export_results_csv.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""Export benchmark results.json files to analysis CSVs with full sweep parameters."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+from data_export import get_frontier
+
+# First five columns match plot_pareto.py comma-split indices (recall=2, throughput=3, latency=4).
+CSV_COLUMNS = [
+    "algo_name",
+    "index_name",
+    "recall",
+    "throughput",
+    "latency",
+    "threads",
+    "cpu_time",
+    "end_to_end",
+    "topk",
+    "k",
+    "n_queries",
+    "persistent",
+    "search_width",
+    "total_queries",
+    "build time",
+    "build threads",
+    "build cpu_time",
+    "build GPU",
+    "graph_degree",
+    "intermediate_graph_degree",
+    "label",
+    "run_directory",
+]
+
+
+def _metric(metrics: Dict[str, Any], suffix: str) -> Optional[float]:
+    suffix_lower = suffix.lower()
+    for key, value in metrics.items():
+        if suffix_lower in key.lower():
+            if value is None:
+                return None
+            if isinstance(value, str):
+                if value.lower() == "nan":
+                    return float("nan")
+                try:
+                    return float(value)
+                except ValueError:
+                    return None
+            try:
+                return float(value)
+            except (TypeError, ValueError):
+                return None
+    return None
+
+
+def create_build_index_name(config: Dict[str, Any]) -> str:
+    """Index identity for build metrics (excludes query-thread suffix)."""
+    name = create_index_name(config)
+    if config.get("algoToRun") in ["CAGRA_SEARCH", "cagra_search"]:
+        return re.sub(r"-qt\d+$", "", name)
+    return name
+
+
+def create_index_name(config: Dict[str, Any]) -> str:
+    algorithm = config.get("algoToRun", "UNKNOWN")
+    ef_search = config.get("efSearch", 0)
+
+    if algorithm in ["LUCENE_HNSW", "hnsw"]:
+        beam_width = config.get("hnswBeamWidth", 0)
+        max_conn = config.get("hnswMaxConn", 0)
+        return f"beam{beam_width}-conn{max_conn}-ef{ef_search}"
+    if algorithm in ["CAGRA_HNSW", "cagra_hnsw"]:
+        graph_degree = config.get("cagraGraphDegree", 0)
+        intermediate_degree = config.get("cagraIntermediateGraphDegree", 0)
+        return f"ef{ef_search}-deg{graph_degree}-ideg{intermediate_degree}"
+    if algorithm in ["CAGRA_SEARCH", "cagra_search"]:
+        graph_degree = config.get("cagraGraphDegree", 0)
+        intermediate_degree = config.get("cagraIntermediateGraphDegree", 0)
+        search_width = config.get("cagraSearchWidth", 0)
+        query_threads = config.get("queryThreads", 0)
+        return (
+            f"ef{ef_search}-sw{search_width}-deg{graph_degree}-"
+            f"ideg{intermediate_degree}-qt{query_threads}"
+        )
+    return f"ef{ef_search}"
+
+
+def create_label(config: Dict[str, Any]) -> str:
+    algo = config.get("algoToRun", "")
+    parts = [
+        f"ef={config.get('efSearch', '')}",
+        f"sw={config.get('cagraSearchWidth', '')}",
+        f"gd={config.get('cagraGraphDegree', '')}",
+        f"ig={config.get('cagraIntermediateGraphDegree', '')}",
+        f"qt={config.get('queryThreads', '')}",
+    ]
+    return f"{algo} " + " ".join(parts)
+
+
+def row_from_results(results_path: str) -> Dict[str, Any]:
+    with open(results_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    config = data["configuration"]
+    metrics = data.get("metrics", {})
+    algo = config.get("algoToRun", "UNKNOWN")
+
+    recall_pct = _metric(metrics, "recall-accuracy")
+    recall = recall_pct / 100.0 if recall_pct is not None and not math.isnan(recall_pct) else float("nan")
+
+    throughput = _metric(metrics, "query-throughput")
+    latency_ms = _metric(metrics, "mean-latency")
+    cpu_time = _metric(metrics, "mean-retrieval-latency")
+    end_to_end = _metric(metrics, "query-time")
+    build_time = _metric(metrics, "indexing-time")
+
+    top_k = config.get("topK")
+    total_queries = config.get("numQueriesToRun")
+    warmup = config.get("numWarmUpQueries", 0)
+    timed_queries = (total_queries - warmup) if total_queries is not None and warmup is not None else None
+
+    persistent = not config.get("createIndexInMemory", False)
+
+    return {
+        "algo_name": algo,
+        "index_name": create_index_name(config),
+        "recall": recall,
+        "throughput": throughput if throughput is not None else float("nan"),
+        "latency": latency_ms if latency_ms is not None else float("nan"),
+        "threads": config.get("queryThreads"),
+        "cpu_time": cpu_time if cpu_time is not None else float("nan"),
+        "end_to_end": end_to_end if end_to_end is not None else float("nan"),
+        "topk": top_k,
+        "k": top_k,
+        "n_queries": timed_queries,
+        "persistent": persistent,
+        "search_width": config.get("cagraSearchWidth"),
+        "total_queries": total_queries,
+        "build time": build_time if build_time is not None else float("nan"),
+        "build threads": config.get("numIndexThreads"),
+        "build cpu_time": float("nan"),
+        "build GPU": build_time if build_time is not None else float("nan"),
+        "graph_degree": config.get("cagraGraphDegree"),
+        "intermediate_graph_degree": config.get("cagraIntermediateGraphDegree"),
+        "label": create_label(config),
+        "run_directory": config.get("resultsDirectory", results_path),
+    }
+
+
+def collect_rows(sweep_dir: str) -> List[Dict[str, Any]]:
+    rows: List[Dict[str, Any]] = []
+    for root, _, files in os.walk(sweep_dir):
+        if "results.json" not in files:
+            continue
+        path = os.path.join(root, "results.json")
+        try:
+            rows.append(row_from_results(path))
+        except Exception as exc:
+            print(f"Error processing {path}: {exc}")
+    return rows
+
+
+def export_sweep_results(
+    sweep_dir: str,
+    output_dir: str,
+    dataset_name: str,
+) -> Dict[str, str]:
+    rows = collect_rows(sweep_dir)
+    if not rows:
+        raise RuntimeError(f"No results.json files under {sweep_dir}")
+
+    df = pd.DataFrame(rows)[CSV_COLUMNS]
+    k = int(df["topk"].iloc[0])
+    n_queries = int(df["total_queries"].iloc[0])
+
+    dataset_out = Path(output_dir) / dataset_name
+    dataset_out.mkdir(parents=True, exist_ok=True)
+    plot_search_dir = dataset_out / "result" / "search"
+    plot_build_dir = dataset_out / "result" / "build"
+    plot_search_dir.mkdir(parents=True, exist_ok=True)
+    plot_build_dir.mkdir(parents=True, exist_ok=True)
+
+    written: Dict[str, str] = {}
+
+    for algo_name, algo_df in df.groupby("algo_name", sort=False):
+        prefix = f"{algo_name},base,k{k},bs{n_queries}"
+
+        raw_path = dataset_out / f"{prefix},raw.csv"
+        algo_df.to_csv(raw_path, index=False)
+        written["raw"] = str(raw_path)
+
+        throughput_frontier = get_frontier(algo_df, "throughput")
+        throughput_path = dataset_out / f"{prefix},throughput.csv"
+        throughput_frontier.to_csv(throughput_path, index=False)
+        written["throughput"] = str(throughput_path)
+
+        latency_frontier = get_frontier(algo_df, "latency")
+        latency_path = dataset_out / f"{prefix},latency.csv"
+        latency_frontier.to_csv(latency_path, index=False)
+        written["latency"] = str(latency_path)
+
+        algo_df.to_csv(plot_search_dir / raw_path.name, index=False)
+        throughput_frontier.to_csv(plot_search_dir / throughput_path.name, index=False)
+        latency_frontier.to_csv(plot_search_dir / latency_path.name, index=False)
+
+        build_df = algo_df[algo_df["build time"].notna()].copy()
+        if not build_df.empty:
+            build_export = build_df.copy()
+            build_export["index_name"] = build_export.apply(
+                lambda row: re.sub(r"-qt\d+$", "", row["index_name"]),
+                axis=1,
+            )
+            build_export = build_export[
+                ["algo_name", "index_name", "build time"]
+            ].drop_duplicates(subset=["algo_name", "index_name"])
+            build_path = dataset_out / f"{algo_name},base.csv"
+            build_export.to_csv(build_path, index=False)
+            build_plot_path = plot_build_dir / build_path.name
+            build_export.to_csv(build_plot_path, index=False)
+            written["build"] = str(build_path)
+
+    print(f"Exported {len(df)} runs for dataset {dataset_name}")
+    print(f"  Algorithms: {', '.join(df['algo_name'].unique())}")
+    return written
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Export results.json to analysis CSVs")
+    parser.add_argument("--sweep-dir", required=True, help="Directory containing per-run results")
+    parser.add_argument("--output-dir", required=True, help="Output directory (e.g. intermediate-files)")
+    parser.add_argument("--dataset", required=True, help="Dataset subdirectory name")
+    args = parser.parse_args()
+
+    export_sweep_results(args.sweep_dir, args.output_dir, args.dataset)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/plot_pareto.py b/plot_pareto.py
index a481b23..90128a4 100755
--- a/plot_pareto.py
+++ b/plot_pareto.py
@@ -215,6 +215,19 @@ def inv_fun(x):
 def create_plot_build(
     build_results, search_results, linestyles, fn_out, dataset, k, n_queries
 ):
+    import re
+
+    def resolve_build_key(algo_name, index_name):
+        """Match search runs to build runs (build index names omit query-thread suffix)."""
+        candidates = [
+            (algo_name, index_name),
+            (algo_name, re.sub(r"-qt\d+$", "", index_name)),
+        ]
+        for key in candidates:
+            if key in build_results:
+                return key
+        return None
+
     bt_80 = [0] * len(linestyles)
     bt_90 = [0] * len(linestyles)
     bt_95 = [0] * len(linestyles)
@@ -237,8 +250,8 @@ def mean_y(algo):
 
         len_80, len_90, len_95, len_99 = 0, 0, 0, 0
         for i in range(len(xs)):
-            build_key = (ls[i], idxs[i])
-            if build_key not in build_results:
+            build_key = resolve_build_key(ls[i], idxs[i])
+            if build_key is None:
                 continue  # Skip if build result not found
             
             if xs[i] >= 0.80 and xs[i] < 0.90:
@@ -279,6 +292,9 @@ def mean_y(algo):
     df = pd.DataFrame(data, index=index)
     df.replace(0.0, np.nan, inplace=True)
     df = df.dropna(how="all")
+    if df.empty:
+        print(f"Skipping build plot (no build/search index matches): {fn_out}")
+        return
     plt.figure(figsize=(12, 9))
     ax = df.plot.bar(rot=0, color=colors)
     fig = ax.get_figure()
diff --git a/run_pareto_analysis.sh b/run_pareto_analysis.sh
index 33020a8..3eef124 100755
--- a/run_pareto_analysis.sh
+++ b/run_pareto_analysis.sh
@@ -25,76 +25,26 @@ echo "Processing sweep: ${SWEEP_ID}, dataset: ${DATASET_NAME}"
 
 rm -rf "${INTERMEDIATE_DIR}" "${OUTPUT_DIR}/plots"
 
-echo "Converting results to NVIDIA format..."
-python3 convert_to_nvidia_format.py --sweep-dir "${INPUT_DIR}/${DATASET_NAME}" --output-dir "${INTERMEDIATE_DIR}" --dataset "${DATASET_NAME}"
-
-echo "Generating Pareto frontier CSVs..."
-python3 -c "
-import sys
-sys.path.append('.')
-from data_export import convert_json_to_csv_search, convert_json_to_csv_build
-convert_json_to_csv_search('${DATASET_NAME}', '${INTERMEDIATE_DIR}')
-convert_json_to_csv_build('${DATASET_NAME}', '${INTERMEDIATE_DIR}')
-"
-
-if [ $? -ne 0 ]; then
-    echo "Error: NVIDIA data_export.py failed. Exiting."
-    exit 1
-fi
-
-FIRST_RESULTS=$(find "${INPUT_DIR}" -name "results.json" | head -1)
+FIRST_RESULTS=$(find "${INPUT_DIR}/${DATASET_NAME}" -name "results.json" | head -1)
 K=$(python3 -c "import json; print(json.load(open('${FIRST_RESULTS}'))['configuration']['topK'])")
 N_QUERIES=$(python3 -c "import json; print(json.load(open('${FIRST_RESULTS}'))['configuration']['numQueriesToRun'])")
 
-echo "Creating directory structure for plotting..."
-mkdir -p "${INTERMEDIATE_DIR}/${DATASET_NAME}/result/search"
-mkdir -p "${INTERMEDIATE_DIR}/${DATASET_NAME}/result/build"
-
-if [ -d "${INTERMEDIATE_DIR}/${DATASET_NAME}" ]; then
-    cd "${INTERMEDIATE_DIR}/${DATASET_NAME}"
-
-    for file in *throughput.csv *latency.csv *raw.csv; do
-        if [ -f "$file" ]; then
-            if [[ "$file" == *",raw.csv" ]]; then
-                mv "$file" "result/search/${file%,raw.csv},k${K},bs${N_QUERIES},raw.csv"
-            elif [[ "$file" == *",throughput.csv" ]]; then
-                mv "$file" "result/search/${file%,throughput.csv},k${K},bs${N_QUERIES},throughput.csv"
-            elif [[ "$file" == *",latency.csv" ]]; then
-                mv "$file" "result/search/${file%,latency.csv},k${K},bs${N_QUERIES},latency.csv"
-            fi
-        fi
-    done
-
-    for file in *.csv; do
-        if [ -f "$file" ]; then
-            mv "$file" "result/build/"
-        fi
-    done
-
-    cd - > /dev/null
-fi
+echo "Exporting full-parameter CSVs from results.json..."
+python3 export_results_csv.py \
+    --sweep-dir "${INPUT_DIR}/${DATASET_NAME}" \
+    --output-dir "${INTERMEDIATE_DIR}" \
+    --dataset "${DATASET_NAME}"
 
 echo "Generating is_pareto files for Pareto optimal runs..."
 python3 -c "
 import os
+import sys
 import csv
 import json
 import glob
 
-def create_index_name_from_config(config):
-    algorithm = config.get('algoToRun', 'UNKNOWN')
-    ef_search = config.get('efSearch', 0)
-
-    if algorithm in ['LUCENE_HNSW', 'hnsw']:
-        beam_width = config.get('hnswBeamWidth', 0)
-        max_conn = config.get('hnswMaxConn', 0)
-        return f'beam{beam_width}-conn{max_conn}-ef{ef_search}'
-    elif algorithm in ['CAGRA_HNSW', 'cagra_hnsw']:
-        graph_degree = config.get('cagraGraphDegree', 0)
-        intermediate_degree = config.get('cagraIntermediateGraphDegree', 0)
-        return f'ef{ef_search}-deg{graph_degree}-ideg{intermediate_degree}'
-    else:
-        return f'ef{ef_search}'
+sys.path.append('.')
+from export_results_csv import create_index_name as create_index_name_from_config
 
 intermediate_dir = '${INTERMEDIATE_DIR}/${DATASET_NAME}'
 results_dir = '${RESULTS_DIR}/${SWEEP_ID}/${DATASET_NAME}'
@@ -153,11 +103,7 @@ for algorithm, pareto_indices in pareto_runs_by_algo.items():
                 config = results_data['configuration']
                 algo_to_run = config.get('algoToRun')
 
-                algorithm_match = False
-                if algorithm == 'CAGRA_HNSW' and algo_to_run in ['CAGRA_HNSW', 'cagra_hnsw']:
-                    algorithm_match = True
-                elif algorithm == 'LUCENE_HNSW' and algo_to_run in ['LUCENE_HNSW', 'hnsw']:
-                    algorithm_match = True
+                algorithm_match = algo_to_run == algorithm
 
                 if algorithm_match:
                     index_name = create_index_name_from_config(config)
@@ -210,6 +156,14 @@ echo "Plots: ${OUTPUT_DIR}/plots/"
 ls -la "${OUTPUT_DIR}/plots"/*.png
 
 echo ""
+echo "Exporting CSVs to csv-export..."
+CSV_EXPORT_DIR="${INPUT_DIR}/csv-export/${DATASET_NAME}"
+rm -rf "${CSV_EXPORT_DIR}"
+mkdir -p "${CSV_EXPORT_DIR}"
+find "${INTERMEDIATE_DIR}/${DATASET_NAME}" -maxdepth 1 -name '*.csv' -exec cp -t "${CSV_EXPORT_DIR}" {} +
+echo "CSVs saved to: ${CSV_EXPORT_DIR}/"
+ls -la "${CSV_EXPORT_DIR}/"
+
 echo "Cleaning up intermediate files..."
 rm -rf "${INTERMEDIATE_DIR}"
 echo "Intermediate files cleaned up!"
@@ -217,4 +171,4 @@ echo ""
 echo "Final output:"
 echo "- Pareto optimal runs marked with is_pareto files"
 echo "- Plots: ${OUTPUT_DIR}/plots/"
-echo "- No intermediate files (completely cleaned up)"
\ No newline at end of file
+echo "- CSV export: ${CSV_EXPORT_DIR}/"
\ No newline at end of file
diff --git a/sweeps_cagra_search_grid.json b/sweeps_cagra_search_grid.json
new file mode 100644
index 0000000..a53d2a3
--- /dev/null
+++ b/sweeps_cagra_search_grid.json
@@ -0,0 +1,38 @@
+{
+  "wiki10m_cagra_search_grid": {
+    "dataset": "wiki-10m",
+    "common-params": {
+      "numDocs": 10000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [50, 100, 150],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_SEARCH": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphDegree": [16, 32, 64, 128],
+        "cagraIntermediateGraphDegree": [32, 64, 128, 256, 512],
+        "cagraHnswLayers": [1],
+        "cagraGraphBuildAlgo": "NN_DESCENT",
+        "cuvsWriterThreads": 32,
+        "numIndexThreads": 32,
+        "cagraITopK": 15,
+        "cagraSearchWidth": [4, 16, 128],
+        "cagraSearchAlgo": "SINGLE_CTA",
+        "cagraThreadBlockSize": [0]
+      }
+    }
+  }
+}
diff --git a/sweeps_cagra_search_grid_1M.json b/sweeps_cagra_search_grid_1M.json
new file mode 100644
index 0000000..4a47be5
--- /dev/null
+++ b/sweeps_cagra_search_grid_1M.json
@@ -0,0 +1,38 @@
+{
+  "wiki1m_cagra_search_grid": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [50, 100, 150],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_SEARCH": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphDegree": [16, 32, 64, 128],
+        "cagraIntermediateGraphDegree": [32, 64, 128, 256, 512],
+        "cagraHnswLayers": [1],
+        "cagraGraphBuildAlgo": "NN_DESCENT",
+        "cuvsWriterThreads": 32,
+        "numIndexThreads": 32,
+        "cagraITopK": 15,
+        "cagraSearchWidth": [4, 16, 128],
+        "cagraSearchAlgo": "SINGLE_CTA",
+        "cagraThreadBlockSize": [0]
+      }
+    }
+  }
+}

From 66647a63dc89b364f1981fc723b0b15ffd2cdc3b Mon Sep 17 00:00:00 2001
From: root <root@ip-172-31-37-14.ec2.internal>
Date: Sun, 24 May 2026 01:13:06 +0000
Subject: [PATCH 2/6] Organize sweeps under sweeps/ and wire IVF_PQ build
 params in harness.

Move sweep JSON into sweeps/ with wiki 1M grids (CAGRA search, IVF_PQ, hnswlib baseline). Use CUSTOM strategy and pass cuVSIvfPqParams for CAGRA_SEARCH/CAGRA_HNSW when build algo is IVF_PQ. Add dataset checksums for prepare-datasets.sh.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 BenchmarkWithCAGRASearchPrototype.md          |  2 +-
 datasets_test_1M.json                         |  3 +
 .../cuvs/benchmarks/LuceneCuvsBenchmarks.java | 22 ++++---
 sweeps/README.md                              | 26 ++++++++
 sweeps/cagra_ivfpq_1M.json                    | 60 +++++++++++++++++++
 .../cagra_search_grid.json                    |  0
 .../cagra_search_grid_1M.json                 |  0
 sweeps/cagra_search_sw128_retest_1M.json      | 38 ++++++++++++
 sweeps.json => sweeps/default.json            |  0
 sweeps/hnswlib_base_1M.json                   | 31 ++++++++++
 sweeps/lucene_hnsw_test_1M.json               | 31 ++++++++++
 sweeps_test_1M.json => sweeps/test_1M.json    |  0
 .../0/sweeps_trial_cag-hnsw-binary.json       |  0
 .../0/sweeps_trial_cag-hnsw-scalar.json       |  0
 .../trials}/0/sweeps_trial_cag-hnsw.json      |  0
 .../trials}/0/sweeps_trial_cag-search.json    |  0
 .../trials}/0/sweeps_trial_luc-hnsw.json      |  0
 .../trials}/1/sweeps_10M_cag-hnsw.json        |  0
 .../trials}/1/sweeps_10M_luc-hnsw.json        |  0
 19 files changed, 205 insertions(+), 8 deletions(-)
 create mode 100644 sweeps/README.md
 create mode 100644 sweeps/cagra_ivfpq_1M.json
 rename sweeps_cagra_search_grid.json => sweeps/cagra_search_grid.json (100%)
 rename sweeps_cagra_search_grid_1M.json => sweeps/cagra_search_grid_1M.json (100%)
 create mode 100644 sweeps/cagra_search_sw128_retest_1M.json
 rename sweeps.json => sweeps/default.json (100%)
 create mode 100644 sweeps/hnswlib_base_1M.json
 create mode 100644 sweeps/lucene_hnsw_test_1M.json
 rename sweeps_test_1M.json => sweeps/test_1M.json (100%)
 rename {trials => sweeps/trials}/0/sweeps_trial_cag-hnsw-binary.json (100%)
 rename {trials => sweeps/trials}/0/sweeps_trial_cag-hnsw-scalar.json (100%)
 rename {trials => sweeps/trials}/0/sweeps_trial_cag-hnsw.json (100%)
 rename {trials => sweeps/trials}/0/sweeps_trial_cag-search.json (100%)
 rename {trials => sweeps/trials}/0/sweeps_trial_luc-hnsw.json (100%)
 rename {trials => sweeps/trials}/1/sweeps_10M_cag-hnsw.json (100%)
 rename {trials => sweeps/trials}/1/sweeps_10M_luc-hnsw.json (100%)

diff --git a/BenchmarkWithCAGRASearchPrototype.md b/BenchmarkWithCAGRASearchPrototype.md
index 32c1d9c..8bfb501 100644
--- a/BenchmarkWithCAGRASearchPrototype.md
+++ b/BenchmarkWithCAGRASearchPrototype.md
@@ -127,7 +127,7 @@ Run sweeps as below (modify according to your local setup):
 
 ```sh
 cd vectorsearch-benchmarks
-CUDA_DEVICE_MAX_CONNECTIONS=<1 to 32, default 8; should size according to `queryThreads` in sweep config> ./run_sweep.sh --data-dir ../data --datasets datasets_test_1M.json --sweeps sweeps_test_1M.json --configs-dir configs --results-dir results --run-benchmarks
+CUDA_DEVICE_MAX_CONNECTIONS=<1 to 32, default 8; should size according to `queryThreads` in sweep config> ./run_sweep.sh --data-dir ../data --datasets datasets_test_1M.json --sweeps sweeps/test_1M.json --configs-dir configs --results-dir results --run-benchmarks
 ```
 
 If needed, ensure files in data/wiki_all_1M are named as follows:
diff --git a/datasets_test_1M.json b/datasets_test_1M.json
index 601468b..83170c5 100644
--- a/datasets_test_1M.json
+++ b/datasets_test_1M.json
@@ -3,8 +3,11 @@
     "wiki_all_1M": {
       "description": "Wikipedia 768-dimensions, 1M vectors",
       "base_file": "base.1M.fbin",
+      "base_checksum": "b1e81f8cdcf940688ad0131948b890c1cc8029cccb1b7fec5cfb57a7091ed382",
       "query_file": "queries.fbin",
+      "query_checksum": "57c88c301a6ba032855af3a54bdb847a972004e1ab671ce1bff790ee9c39c855",
       "ground_truth_file": "groundtruth.1M.neighbors.ibin",
+      "ground_truth_checksum": "44bb3fdc24c76b150f3b44fb9df707cd2b610d2a044e10388f96ae0b3e074f95",
       "num_docs": 1000000,
       "vector_dimension": 768,
       "top_k_ground_truth": 100
diff --git a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java
index fd424e3..7a0843f 100644
--- a/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java
+++ b/src/main/java/com/searchscale/lucene/cuvs/benchmarks/LuceneCuvsBenchmarks.java
@@ -2,6 +2,7 @@
 
 import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN;
 
+import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo;
 import com.nvidia.cuvs.CuVSIvfPqIndexParams;
 import com.nvidia.cuvs.CuVSIvfPqParams;
 import com.nvidia.cuvs.CuVSIvfPqSearchParams;
@@ -730,7 +731,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
               .withRefinementRate(config.cuVSIvfPqParamsRefinementRate)
               .build();
 
-      AcceleratedHNSWParams params =
+      AcceleratedHNSWParams.Builder acceleratedHnswBuilder =
           new AcceleratedHNSWParams.Builder()
               .withWriterThreads(config.cuvsWriterThreads)
               .withIntermediateGraphDegree(config.cagraIntermediateGraphDegree)
@@ -739,22 +740,29 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
               .withMaxConn(config.hnswMaxConn)
               .withBeamWidth(config.hnswBeamWidth)
               .withCagraGraphBuildAlgo(config.cagraGraphBuildAlgo)
-              .withCuVSIvfPqParams(cip)
-              .build();
+              .withCuVSIvfPqParams(cip);
+      if (config.cagraGraphBuildAlgo == CagraGraphBuildAlgo.IVF_PQ) {
+        acceleratedHnswBuilder.withStrategy(AcceleratedHNSWParams.Strategy.CUSTOM);
+      }
+      AcceleratedHNSWParams params = acceleratedHnswBuilder.build();
 
       if (config.algoToRun.equals(Codex.CAGRA_HNSW)) {
         log.info("<<< Using Lucene101AcceleratedHNSWCodec >>>");
         return new Lucene101AcceleratedHNSWCodec(params);
       } else if (config.algoToRun.equals(Codex.CAGRA_SEARCH)) {
         log.info("<<< Using CuVS2510GPUSearchCodec >>>");
-        GPUSearchParams gpuParams =
+        GPUSearchParams.Builder gpuSearchBuilder =
             new GPUSearchParams.Builder()
                 .withCagraGraphBuildAlgo(config.cagraGraphBuildAlgo)
                 .withWriterThreads(config.cuvsWriterThreads)
                 .withIntermediateGraphDegree(config.cagraIntermediateGraphDegree)
-                .withGraphDegree(config.cagraGraphDegree)
-                .build();
-        return new CuVS2510GPUSearchCodec(gpuParams);
+                .withGraphDegree(config.cagraGraphDegree);
+        if (config.cagraGraphBuildAlgo == CagraGraphBuildAlgo.IVF_PQ) {
+          gpuSearchBuilder
+              .withStrategy(GPUSearchParams.Strategy.CUSTOM)
+              .withCuVSIvfPqParams(cip);
+        }
+        return new CuVS2510GPUSearchCodec(gpuSearchBuilder.build());
       } else if (config.algoToRun.equals(Codex.CAGRA_HNSW_BINARY)) {
         log.info("<<< Using LuceneAcceleratedHNSWBinaryQuantizedCodec >>>");
         return new LuceneAcceleratedHNSWBinaryQuantizedCodec(params);
diff --git a/sweeps/README.md b/sweeps/README.md
new file mode 100644
index 0000000..bad2598
--- /dev/null
+++ b/sweeps/README.md
@@ -0,0 +1,26 @@
+# Sweep definitions
+
+Pass any file here to `run_sweep.sh` with `--sweeps sweeps/<file>.json`.
+
+| File | Description |
+|------|-------------|
+| `default.json` | Upstream default sweep |
+| `test_1M.json` | Small 1M wiki smoke (CAGRA_SEARCH + CAGRA_HNSW) |
+| `cagra_search_grid_1M.json` | Full CAGRA_SEARCH NN_DESCENT grid (wiki 1M) |
+| `cagra_search_grid.json` | CAGRA_SEARCH grid (10M dataset name) |
+| `cagra_search_sw128_retest_1M.json` | Single sw128 failure retest |
+| `hnswlib_base_1M.json` | LUCENE_HNSW hnswlib-style grid |
+| `lucene_hnsw_test_1M.json` | Single LUCENE_HNSW smoke |
+| `cagra_ivfpq_1M.json` | CAGRA_SEARCH + CAGRA_HNSW, IVF_PQ build |
+| `trials/0/`, `trials/1/` | Historical trial sweeps |
+
+Example:
+
+```bash
+./run_sweep.sh --data-dir /raid/workspace/data \
+  --datasets datasets_test_1M.json \
+  --sweeps sweeps/cagra_ivfpq_1M.json \
+  --configs-dir configs_cagra_ivfpq_1m \
+  --results-dir results \
+  --run-benchmarks
+```
diff --git a/sweeps/cagra_ivfpq_1M.json b/sweeps/cagra_ivfpq_1M.json
new file mode 100644
index 0000000..4a6894c
--- /dev/null
+++ b/sweeps/cagra_ivfpq_1M.json
@@ -0,0 +1,60 @@
+{
+  "wiki1m_cagra_ivfpq": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [50, 100, 150],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_SEARCH": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [32],
+        "cagraIntermediateGraphDegree": [64],
+        "cuvsWriterThreads": 32,
+        "numIndexThreads": 32,
+        "cagraITopK": 15,
+        "cagraSearchWidth": [4, 16],
+        "cagraSearchAlgo": "SINGLE_CTA",
+        "cagraThreadBlockSize": [0],
+        "cuVSIvfPqIndexParamsNLists": [1024, 2048],
+        "cuVSIvfPqIndexParamsPqDim": [0],
+        "cuVSIvfPqIndexParamsPqBits": [8],
+        "cuVSIvfPqIndexParamsKmeansNIters": [25],
+        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
+        "cuVSIvfPqParamsRefinementRate": [1, 2]
+      },
+      "CAGRA_HNSW": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [32],
+        "cagraIntermediateGraphDegree": [64],
+        "cagraHnswLayers": [1],
+        "cuvsWriterThreads": 32,
+        "numIndexThreads": 32,
+        "hnswMaxConn": [32],
+        "hnswBeamWidth": [32],
+        "cuVSIvfPqIndexParamsNLists": [1024, 2048],
+        "cuVSIvfPqIndexParamsPqDim": [0],
+        "cuVSIvfPqIndexParamsPqBits": [8],
+        "cuVSIvfPqIndexParamsKmeansNIters": [25],
+        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
+        "cuVSIvfPqParamsRefinementRate": [1, 2]
+      }
+    }
+  }
+}
diff --git a/sweeps_cagra_search_grid.json b/sweeps/cagra_search_grid.json
similarity index 100%
rename from sweeps_cagra_search_grid.json
rename to sweeps/cagra_search_grid.json
diff --git a/sweeps_cagra_search_grid_1M.json b/sweeps/cagra_search_grid_1M.json
similarity index 100%
rename from sweeps_cagra_search_grid_1M.json
rename to sweeps/cagra_search_grid_1M.json
diff --git a/sweeps/cagra_search_sw128_retest_1M.json b/sweeps/cagra_search_sw128_retest_1M.json
new file mode 100644
index 0000000..c9b6b05
--- /dev/null
+++ b/sweeps/cagra_search_sw128_retest_1M.json
@@ -0,0 +1,38 @@
+{
+  "wiki1m_sw128_retest": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [100],
+      "queryThreads": [8],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_SEARCH": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphDegree": [64],
+        "cagraIntermediateGraphDegree": [128],
+        "cagraHnswLayers": [1],
+        "cagraGraphBuildAlgo": "NN_DESCENT",
+        "cuvsWriterThreads": 32,
+        "numIndexThreads": 32,
+        "cagraITopK": 15,
+        "cagraSearchWidth": [128],
+        "cagraSearchAlgo": "SINGLE_CTA",
+        "cagraThreadBlockSize": [0]
+      }
+    }
+  }
+}
diff --git a/sweeps.json b/sweeps/default.json
similarity index 100%
rename from sweeps.json
rename to sweeps/default.json
diff --git a/sweeps/hnswlib_base_1M.json b/sweeps/hnswlib_base_1M.json
new file mode 100644
index 0000000..a1abcde
--- /dev/null
+++ b/sweeps/hnswlib_base_1M.json
@@ -0,0 +1,31 @@
+{
+  "wiki1m_hnswlib_base": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [100, 120, 200, 400, 600, 800],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "LUCENE_HNSW": {
+        "indexDirPath": "hnswIndex",
+        "hnswMaxConn": [12, 16, 24, 36],
+        "hnswBeamWidth": [64, 128, 256, 512],
+        "hnswMergeThreads": 32,
+        "numIndexThreads": 32
+      }
+    }
+  }
+}
diff --git a/sweeps/lucene_hnsw_test_1M.json b/sweeps/lucene_hnsw_test_1M.json
new file mode 100644
index 0000000..440dd0d
--- /dev/null
+++ b/sweeps/lucene_hnsw_test_1M.json
@@ -0,0 +1,31 @@
+{
+  "wiki1m_lucene_hnsw_test": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [100],
+      "queryThreads": [8],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "LUCENE_HNSW": {
+        "indexDirPath": "hnswIndex",
+        "hnswMaxConn": [32],
+        "hnswBeamWidth": [32],
+        "hnswMergeThreads": 32,
+        "numIndexThreads": 32
+      }
+    }
+  }
+}
diff --git a/sweeps_test_1M.json b/sweeps/test_1M.json
similarity index 100%
rename from sweeps_test_1M.json
rename to sweeps/test_1M.json
diff --git a/trials/0/sweeps_trial_cag-hnsw-binary.json b/sweeps/trials/0/sweeps_trial_cag-hnsw-binary.json
similarity index 100%
rename from trials/0/sweeps_trial_cag-hnsw-binary.json
rename to sweeps/trials/0/sweeps_trial_cag-hnsw-binary.json
diff --git a/trials/0/sweeps_trial_cag-hnsw-scalar.json b/sweeps/trials/0/sweeps_trial_cag-hnsw-scalar.json
similarity index 100%
rename from trials/0/sweeps_trial_cag-hnsw-scalar.json
rename to sweeps/trials/0/sweeps_trial_cag-hnsw-scalar.json
diff --git a/trials/0/sweeps_trial_cag-hnsw.json b/sweeps/trials/0/sweeps_trial_cag-hnsw.json
similarity index 100%
rename from trials/0/sweeps_trial_cag-hnsw.json
rename to sweeps/trials/0/sweeps_trial_cag-hnsw.json
diff --git a/trials/0/sweeps_trial_cag-search.json b/sweeps/trials/0/sweeps_trial_cag-search.json
similarity index 100%
rename from trials/0/sweeps_trial_cag-search.json
rename to sweeps/trials/0/sweeps_trial_cag-search.json
diff --git a/trials/0/sweeps_trial_luc-hnsw.json b/sweeps/trials/0/sweeps_trial_luc-hnsw.json
similarity index 100%
rename from trials/0/sweeps_trial_luc-hnsw.json
rename to sweeps/trials/0/sweeps_trial_luc-hnsw.json
diff --git a/trials/1/sweeps_10M_cag-hnsw.json b/sweeps/trials/1/sweeps_10M_cag-hnsw.json
similarity index 100%
rename from trials/1/sweeps_10M_cag-hnsw.json
rename to sweeps/trials/1/sweeps_10M_cag-hnsw.json
diff --git a/trials/1/sweeps_10M_luc-hnsw.json b/sweeps/trials/1/sweeps_10M_luc-hnsw.json
similarity index 100%
rename from trials/1/sweeps_10M_luc-hnsw.json
rename to sweeps/trials/1/sweeps_10M_luc-hnsw.json

From 0b642f6dce0d55967b9853a0d93f24863f15f5fd Mon Sep 17 00:00:00 2001
From: ManasSingh12345 <ManasSingh12345@users.noreply.github.com>
Date: Sun, 24 May 2026 04:09:02 +0000
Subject: [PATCH 3/6] Fix IVF-PQ sweep builds and export IVF-PQ params in CSV.

IVF-PQ index/search params affect CAGRA graph construction, so they must
trigger separate index builds rather than search-only skipIndexing runs.
Also set indexDirPath for the harness and add IVF-PQ columns to CSV export.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 export_results_csv.py    | 46 ++++++++++++++++++++++++++++++++++++++++
 generate-combinations.py | 15 +++++++------
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/export_results_csv.py b/export_results_csv.py
index ead2d1b..caa8edc 100644
--- a/export_results_csv.py
+++ b/export_results_csv.py
@@ -37,10 +37,27 @@
     "build GPU",
     "graph_degree",
     "intermediate_graph_degree",
+    "graph_build_algo",
+    "ivf_nlists",
+    "ivf_pq_dim",
+    "ivf_pq_bits",
+    "ivf_kmeans_iters",
+    "ivf_nprobes",
+    "ivf_refinement_rate",
     "label",
     "run_directory",
 ]
 
+# Maps CSV column names to keys in results.json configuration.
+IVF_PQ_CONFIG_KEYS = {
+    "ivf_nlists": "cuVSIvfPqIndexParamsNLists",
+    "ivf_pq_dim": "cuVSIvfPqIndexParamsPqDim",
+    "ivf_pq_bits": "cuVSIvfPqIndexParamsPqBits",
+    "ivf_kmeans_iters": "cuVSIvfPqIndexParamsKmeansNIters",
+    "ivf_nprobes": "cuVSIvfPqSearchParamsNProbes",
+    "ivf_refinement_rate": "cuVSIvfPqParamsRefinementRate",
+}
+
 
 def _metric(metrics: Dict[str, Any], suffix: str) -> Optional[float]:
     suffix_lower = suffix.lower()
@@ -94,6 +111,26 @@ def create_index_name(config: Dict[str, Any]) -> str:
     return f"ef{ef_search}"
 
 
+def _uses_ivf_pq_graph_build(config: Dict[str, Any]) -> bool:
+    return config.get("cagraGraphBuildAlgo") == "IVF_PQ"
+
+
+def ivf_pq_fields(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Extract IVF-PQ sweep fields; use NaN when graph build is not IVF_PQ."""
+    graph_build_algo = config.get("cagraGraphBuildAlgo")
+    if not _uses_ivf_pq_graph_build(config):
+        return {
+            "graph_build_algo": graph_build_algo,
+            **{column: float("nan") for column in IVF_PQ_CONFIG_KEYS},
+        }
+
+    fields: Dict[str, Any] = {"graph_build_algo": graph_build_algo}
+    for column, config_key in IVF_PQ_CONFIG_KEYS.items():
+        value = config.get(config_key)
+        fields[column] = float("nan") if value is None else value
+    return fields
+
+
 def create_label(config: Dict[str, Any]) -> str:
     algo = config.get("algoToRun", "")
     parts = [
@@ -103,6 +140,14 @@ def create_label(config: Dict[str, Any]) -> str:
         f"ig={config.get('cagraIntermediateGraphDegree', '')}",
         f"qt={config.get('queryThreads', '')}",
     ]
+    if _uses_ivf_pq_graph_build(config):
+        parts.extend(
+            [
+                f"lists={config.get('cuVSIvfPqIndexParamsNLists', '')}",
+                f"probes={config.get('cuVSIvfPqSearchParamsNProbes', '')}",
+                f"refine={config.get('cuVSIvfPqParamsRefinementRate', '')}",
+            ]
+        )
     return f"{algo} " + " ".join(parts)
 
 
@@ -151,6 +196,7 @@ def row_from_results(results_path: str) -> Dict[str, Any]:
         "build GPU": build_time if build_time is not None else float("nan"),
         "graph_degree": config.get("cagraGraphDegree"),
         "intermediate_graph_degree": config.get("cagraIntermediateGraphDegree"),
+        **ivf_pq_fields(config),
         "label": create_label(config),
         "run_directory": config.get("resultsDirectory", results_path),
     }
diff --git a/generate-combinations.py b/generate-combinations.py
index cd7a792..d276277 100644
--- a/generate-combinations.py
+++ b/generate-combinations.py
@@ -28,6 +28,10 @@
 # All other variant parameters are treated as build-only, so the index is
 # shared across every search-param combination that belongs to the same
 # build-param combination.
+#
+# CuVS IVF-PQ params (index + search + refinement) are NOT listed here:
+# when cagraGraphBuildAlgo is IVF_PQ, they are consumed while the CAGRA
+# graph is being constructed, not at HNSW/CAGRA query time.
 SEARCH_ONLY_PARAMS = {
     'efSearch',
     'cagraITopK',
@@ -38,11 +42,6 @@
     'queryThreads',
     'numQueriesToRun',
     'numWarmUpQueries',
-    'cuVSIvfPqParamsRefinementRate',
-    'cuVSIvfPqSearchParamsNProbes',
-    'cuVSIvfPqSearchParamsInternalDistanceDtype',
-    'cuVSIvfPqSearchParamsLutDtype',
-    'cuVSIvfPqSearchParamsPreferredShmemCarveout',
     'filterRejectRate',
 }
 
@@ -134,10 +133,14 @@
                     config['cleanIndexDirectory'] = (search_idx == len(search_combinations) - 1)
 
                     # Point all configs in this build group at the same index directory.
+                    index_dir = f"cuvsIndex-{base_hash}"
                     if 'hnswIndexDirPath' in config:
                         config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
                     if 'cuvsIndexDirPath' in config:
-                        config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"
+                        config['cuvsIndexDirPath'] = index_dir
+                    # LuceneCuvsBenchmarks reads indexDirPath (not cuvsIndexDirPath).
+                    if 'indexDirPath' in config:
+                        config['indexDirPath'] = index_dir
 
                     filename = f"{algo}-{hash_id}.json"
                     sweep_dir = f"{args.configs_dir}/{sweep}"

From e7ad98a8ba520df973aff6b069dc0ae3b0df303f Mon Sep 17 00:00:00 2001
From: ManasSingh12345 <ManasSingh12345@users.noreply.github.com>
Date: Mon, 25 May 2026 02:27:38 +0000
Subject: [PATCH 4/6] Add IVF-PQ sweep definitions for 1M wiki benchmarks.

Add sensitivity, full CAGRA_HNSW grid, and refine=2 rerun sweep configs; add
the missing cagraHnswLayers setting to CAGRA_SEARCH in the base ivfpq sweep.
---
 sweeps/cagra_ivfpq_1M.json                  |  1 +
 sweeps/cagra_ivfpq_1M_full.json             | 42 ++++++++++++++
 sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json | 42 ++++++++++++++
 sweeps/cagra_ivfpq_1M_refine2_rerun.json    | 63 +++++++++++++++++++++
 4 files changed, 148 insertions(+)
 create mode 100644 sweeps/cagra_ivfpq_1M_full.json
 create mode 100644 sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json
 create mode 100644 sweeps/cagra_ivfpq_1M_refine2_rerun.json

diff --git a/sweeps/cagra_ivfpq_1M.json b/sweeps/cagra_ivfpq_1M.json
index 4a6894c..98d8aac 100644
--- a/sweeps/cagra_ivfpq_1M.json
+++ b/sweeps/cagra_ivfpq_1M.json
@@ -25,6 +25,7 @@
         "cagraGraphBuildAlgo": "IVF_PQ",
         "cagraGraphDegree": [32],
         "cagraIntermediateGraphDegree": [64],
+        "cagraHnswLayers": [1],
         "cuvsWriterThreads": 32,
         "numIndexThreads": 32,
         "cagraITopK": 15,
diff --git a/sweeps/cagra_ivfpq_1M_full.json b/sweeps/cagra_ivfpq_1M_full.json
new file mode 100644
index 0000000..51e76d6
--- /dev/null
+++ b/sweeps/cagra_ivfpq_1M_full.json
@@ -0,0 +1,42 @@
+{
+  "wiki1m_cagra_ivfpq_full": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [50, 100, 150],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_HNSW": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [16, 32, 64, 128],
+        "cagraIntermediateGraphDegree": [64, 128, 256, 512],
+        "cagraHnswLayers": 1,
+        "cuvsWriterThreads": 4,
+        "numIndexThreads": 4,
+        "hnswMaxConn": [32, 64],
+        "hnswBeamWidth": [128, 256],
+        "cuVSIvfPqIndexParamsNLists": 512,
+        "cuVSIvfPqIndexParamsPqDim": [0, 128],
+        "cuVSIvfPqIndexParamsPqBits": [4, 8],
+        "cuVSIvfPqIndexParamsKmeansNIters": 10,
+        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
+        "cuVSIvfPqParamsRefinementRate": 1
+      }
+    }
+  }
+}
diff --git a/sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json b/sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json
new file mode 100644
index 0000000..d79acfe
--- /dev/null
+++ b/sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json
@@ -0,0 +1,42 @@
+{
+  "wiki1m_cagra_ivfpq_hnsw": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [100],
+      "queryThreads": [8],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_HNSW": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [32],
+        "cagraIntermediateGraphDegree": [64],
+        "cagraHnswLayers": [1],
+        "cuvsWriterThreads": 4,
+        "numIndexThreads": 4,
+        "hnswMaxConn": [32],
+        "hnswBeamWidth": [32],
+        "cuVSIvfPqIndexParamsNLists": [512, 2048],
+        "cuVSIvfPqIndexParamsPqDim": [0, 128],
+        "cuVSIvfPqIndexParamsPqBits": [4, 8],
+        "cuVSIvfPqIndexParamsKmeansNIters": [10, 25],
+        "cuVSIvfPqSearchParamsNProbes": [10, 50],
+        "cuVSIvfPqParamsRefinementRate": [1, 2]
+      }
+    }
+  }
+}
diff --git a/sweeps/cagra_ivfpq_1M_refine2_rerun.json b/sweeps/cagra_ivfpq_1M_refine2_rerun.json
new file mode 100644
index 0000000..f19cebf
--- /dev/null
+++ b/sweeps/cagra_ivfpq_1M_refine2_rerun.json
@@ -0,0 +1,63 @@
+{
+  "wiki1m_cagra_ivfpq": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [50, 100, 150],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_SEARCH": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [32],
+        "cagraIntermediateGraphDegree": [64],
+        "cagraHnswLayers": [1],
+        "cuvsWriterThreads": 4,
+        "numIndexThreads": 4,
+        "cagraITopK": 15,
+        "cagraSearchWidth": [4, 16],
+        "cagraSearchAlgo": "SINGLE_CTA",
+        "cagraThreadBlockSize": [0],
+        "cuVSIvfPqIndexParamsConservativeMemoryAllocation": true,
+        "cuVSIvfPqIndexParamsNLists": [1024, 2048],
+        "cuVSIvfPqIndexParamsPqDim": [0],
+        "cuVSIvfPqIndexParamsPqBits": [8],
+        "cuVSIvfPqIndexParamsKmeansNIters": [25],
+        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
+        "cuVSIvfPqParamsRefinementRate": [1, 2]
+      },
+      "CAGRA_HNSW": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [32],
+        "cagraIntermediateGraphDegree": [64],
+        "cagraHnswLayers": [1],
+        "cuvsWriterThreads": 4,
+        "numIndexThreads": 4,
+        "hnswMaxConn": [32],
+        "hnswBeamWidth": [32],
+        "cuVSIvfPqIndexParamsConservativeMemoryAllocation": true,
+        "cuVSIvfPqIndexParamsNLists": [1024, 2048],
+        "cuVSIvfPqIndexParamsPqDim": [0],
+        "cuVSIvfPqIndexParamsPqBits": [8],
+        "cuVSIvfPqIndexParamsKmeansNIters": [25],
+        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
+        "cuVSIvfPqParamsRefinementRate": [2]
+      }
+    }
+  }
+}

From cc080f10ec81f316e368c598a4737a47dc888016 Mon Sep 17 00:00:00 2001
From: ManasSingh12345 <ManasSingh12345@users.noreply.github.com>
Date: Mon, 25 May 2026 02:36:00 +0000
Subject: [PATCH 5/6] Align 1M sweep search grids and add Lucene HNSW full
 sweep.

Use one efSearch grid (the union of the previous CAGRA IVF-PQ and Lucene HNSW
values) for both, so recall/throughput Pareto comparisons share search parameters.
---
 sweeps/cagra_ivfpq_1M_full.json |  2 +-
 sweeps/lucene_hnsw_1M_full.json | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 sweeps/lucene_hnsw_1M_full.json

diff --git a/sweeps/cagra_ivfpq_1M_full.json b/sweeps/cagra_ivfpq_1M_full.json
index 51e76d6..9557db4 100644
--- a/sweeps/cagra_ivfpq_1M_full.json
+++ b/sweeps/cagra_ivfpq_1M_full.json
@@ -7,7 +7,7 @@
       "numWarmUpQueries": 5000,
       "flushFreq": 10000000,
       "topK": 100,
-      "efSearch": [50, 100, 150],
+      "efSearch": [50, 100, 120, 150, 200, 400, 600, 800],
       "queryThreads": [1, 8, 64],
       "vectorColName": "vector",
       "createIndexInMemory": false,
diff --git a/sweeps/lucene_hnsw_1M_full.json b/sweeps/lucene_hnsw_1M_full.json
new file mode 100644
index 0000000..eb31c74
--- /dev/null
+++ b/sweeps/lucene_hnsw_1M_full.json
@@ -0,0 +1,31 @@
+{
+  "wiki1m_lucene_hnsw_full": {
+    "dataset": "wiki_all_1M",
+    "common-params": {
+      "numDocs": 1000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 100,
+      "efSearch": [50, 100, 120, 150, 200, 400, 600, 800],
+      "queryThreads": [1, 8, 64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 0,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "LUCENE_HNSW": {
+        "indexDirPath": "hnswIndex",
+        "hnswMaxConn": [12, 16, 24, 32, 36, 64],
+        "hnswBeamWidth": [64, 128, 256, 512],
+        "hnswMergeThreads": 32,
+        "numIndexThreads": 32
+      }
+    }
+  }
+}

From c9662c3f588225e59483bd8536e2a31dd21376bf Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-37-14.ec2.internal>
Date: Mon, 25 May 2026 21:57:03 +0000
Subject: [PATCH 6/6] Add ads 10M sweep configs and prune obsolete sweep files.

Introduce CAGRA IVF-PQ and Lucene HNSW grids for ads_sourcing-10m at 1536d and 96d, update sweeps README, and remove one-off retest and historical trial sweep definitions.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 sweeps/README.md                              | 14 +++--
 sweeps/ads10m_cagra_ivfpq_1536d.json          | 46 ++++++++++++++
 sweeps/ads10m_cagra_ivfpq_96d.json            | 46 ++++++++++++++
 ..._1M.json => ads10m_lucene_hnsw_1536d.json} | 18 +++---
 ...se_1M.json => ads10m_lucene_hnsw_96d.json} | 18 +++---
 sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json   | 42 -------------
 sweeps/cagra_ivfpq_1M_refine2_rerun.json      | 63 -------------------
 sweeps/cagra_search_sw128_retest_1M.json      | 38 -----------
 .../0/sweeps_trial_cag-hnsw-binary.json       | 32 ----------
 .../0/sweeps_trial_cag-hnsw-scalar.json       | 32 ----------
 sweeps/trials/0/sweeps_trial_cag-hnsw.json    | 32 ----------
 sweeps/trials/0/sweeps_trial_cag-search.json  | 34 ----------
 sweeps/trials/0/sweeps_trial_luc-hnsw.json    | 30 ---------
 sweeps/trials/1/sweeps_10M_cag-hnsw.json      | 32 ----------
 sweeps/trials/1/sweeps_10M_luc-hnsw.json      | 30 ---------
 15 files changed, 118 insertions(+), 389 deletions(-)
 create mode 100644 sweeps/ads10m_cagra_ivfpq_1536d.json
 create mode 100644 sweeps/ads10m_cagra_ivfpq_96d.json
 rename sweeps/{lucene_hnsw_test_1M.json => ads10m_lucene_hnsw_1536d.json} (66%)
 rename sweeps/{hnswlib_base_1M.json => ads10m_lucene_hnsw_96d.json} (64%)
 delete mode 100644 sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json
 delete mode 100644 sweeps/cagra_ivfpq_1M_refine2_rerun.json
 delete mode 100644 sweeps/cagra_search_sw128_retest_1M.json
 delete mode 100644 sweeps/trials/0/sweeps_trial_cag-hnsw-binary.json
 delete mode 100644 sweeps/trials/0/sweeps_trial_cag-hnsw-scalar.json
 delete mode 100644 sweeps/trials/0/sweeps_trial_cag-hnsw.json
 delete mode 100644 sweeps/trials/0/sweeps_trial_cag-search.json
 delete mode 100644 sweeps/trials/0/sweeps_trial_luc-hnsw.json
 delete mode 100644 sweeps/trials/1/sweeps_10M_cag-hnsw.json
 delete mode 100644 sweeps/trials/1/sweeps_10M_luc-hnsw.json

diff --git a/sweeps/README.md b/sweeps/README.md
index bad2598..143761e 100644
--- a/sweeps/README.md
+++ b/sweeps/README.md
@@ -4,15 +4,17 @@ Pass any file here to `run_sweep.sh` with `--sweeps sweeps/<file>.json`.
 
 | File | Description |
 |------|-------------|
-| `default.json` | Upstream default sweep |
+| `default.json` | Upstream default sweep (wiki-10m, sift-1m) |
 | `test_1M.json` | Small 1M wiki smoke (CAGRA_SEARCH + CAGRA_HNSW) |
 | `cagra_search_grid_1M.json` | Full CAGRA_SEARCH NN_DESCENT grid (wiki 1M) |
 | `cagra_search_grid.json` | CAGRA_SEARCH grid (10M dataset name) |
-| `cagra_search_sw128_retest_1M.json` | Single sw128 failure retest |
-| `hnswlib_base_1M.json` | LUCENE_HNSW hnswlib-style grid |
-| `lucene_hnsw_test_1M.json` | Single LUCENE_HNSW smoke |
-| `cagra_ivfpq_1M.json` | CAGRA_SEARCH + CAGRA_HNSW, IVF_PQ build |
-| `trials/0/`, `trials/1/` | Historical trial sweeps |
+| `cagra_ivfpq_1M.json` | CAGRA_SEARCH + CAGRA_HNSW, IVF_PQ build (wiki 1M) |
+| `cagra_ivfpq_1M_full.json` | Full CAGRA_HNSW IVF_PQ grid (wiki 1M) |
+| `lucene_hnsw_1M_full.json` | Full LUCENE_HNSW grid (wiki 1M) |
+| `ads10m_cagra_ivfpq_1536d.json` | CAGRA_HNSW IVF_PQ grid (ads 10M, 1536d, topK=200) |
+| `ads10m_lucene_hnsw_1536d.json` | LUCENE_HNSW grid (ads 10M, 1536d, topK=200) |
+| `ads10m_cagra_ivfpq_96d.json` | CAGRA_HNSW IVF_PQ grid (ads 10M, 96d, topK=1500) |
+| `ads10m_lucene_hnsw_96d.json` | LUCENE_HNSW grid (ads 10M, 96d, topK=1500) |
 
 Example:
 
diff --git a/sweeps/ads10m_cagra_ivfpq_1536d.json b/sweeps/ads10m_cagra_ivfpq_1536d.json
new file mode 100644
index 0000000..24e9e1b
--- /dev/null
+++ b/sweeps/ads10m_cagra_ivfpq_1536d.json
@@ -0,0 +1,46 @@
+{
+  "ads10m_cagra_ivfpq": {
+    "dataset": "ads_sourcing-10m",
+    "common-params": {
+      "numDocs": 10000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 200,
+      "efSearch": [200, 400, 600, 800, 1000],
+      "queryThreads": [64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 1,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_HNSW": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [64, 128],
+        "cagraIntermediateGraphDegree": [128, 256],
+        "cagraHnswLayers": 1,
+        "cuvsWriterThreads": 4,
+        "numIndexThreads": 4,
+        "hnswMaxConn": 64,
+        "hnswBeamWidth": 128,
+        "cuVSIvfPqIndexParamsNLists": 5000,
+        "cuVSIvfPqIndexParamsPqDim": [384, 768],
+        "cuVSIvfPqIndexParamsPqBits": [4],
+        "cuVSIvfPqIndexParamsKmeansNIters": 10,
+        "cuVSIvfPqIndexParamsKmeansTrainsetFraction": 0.1,
+        "cuVSIvfPqIndexParamsConservativeMemoryAllocation": true,
+        "cuVSIvfPqSearchParamsNProbes": [10, 20],
+        "cuVSIvfPqSearchParamsLutDtype": "CUDA_R_16F",
+        "cuVSIvfPqSearchParamsInternalDistanceDtype": "CUDA_R_16F",
+        "cuVSIvfPqParamsRefinementRate": 1
+      }
+    }
+  }
+}
diff --git a/sweeps/ads10m_cagra_ivfpq_96d.json b/sweeps/ads10m_cagra_ivfpq_96d.json
new file mode 100644
index 0000000..98de269
--- /dev/null
+++ b/sweeps/ads10m_cagra_ivfpq_96d.json
@@ -0,0 +1,46 @@
+{
+  "ads10m_cagra_ivfpq": {
+    "dataset": "ads_sourcing-10m",
+    "common-params": {
+      "numDocs": 10000000,
+      "numQueriesToRun": 10000,
+      "numWarmUpQueries": 5000,
+      "flushFreq": 10000000,
+      "topK": 1500,
+      "efSearch": [1500, 2000, 2500, 3000, 4000],
+      "queryThreads": [64],
+      "vectorColName": "vector",
+      "createIndexInMemory": false,
+      "cleanIndexDirectory": true,
+      "saveResultsOnDisk": true,
+      "forceMerge": 1,
+      "enableTieredMerge": true,
+      "enableIndexWriterInfoStream": false,
+      "cuvsWorkspacePoolSize": 4194304,
+      "filterRejectRate": 0.0
+    },
+    "algorithms": {
+      "CAGRA_HNSW": {
+        "indexDirPath": "cuvsIndex",
+        "cagraGraphBuildAlgo": "IVF_PQ",
+        "cagraGraphDegree": [64, 128],
+        "cagraIntermediateGraphDegree": [128, 256],
+        "cagraHnswLayers": 1,
+        "cuvsWriterThreads": 4,
+        "numIndexThreads": 4,
+        "hnswMaxConn": 64,
+        "hnswBeamWidth": 128,
+        "cuVSIvfPqIndexParamsNLists": 5000,
+        "cuVSIvfPqIndexParamsPqDim": [32, 64],
+        "cuVSIvfPqIndexParamsPqBits": [4],
+        "cuVSIvfPqIndexParamsKmeansNIters": 10,
+        "cuVSIvfPqIndexParamsKmeansTrainsetFraction": 0.1,
+        "cuVSIvfPqIndexParamsConservativeMemoryAllocation": true,
+        "cuVSIvfPqSearchParamsNProbes": [10, 20],
+        "cuVSIvfPqSearchParamsLutDtype": "CUDA_R_16F",
+        "cuVSIvfPqSearchParamsInternalDistanceDtype": "CUDA_R_16F",
+        "cuVSIvfPqParamsRefinementRate": 1
+      }
+    }
+  }
+}
diff --git a/sweeps/lucene_hnsw_test_1M.json b/sweeps/ads10m_lucene_hnsw_1536d.json
similarity index 66%
rename from sweeps/lucene_hnsw_test_1M.json
rename to sweeps/ads10m_lucene_hnsw_1536d.json
index 440dd0d..719380b 100644
--- a/sweeps/lucene_hnsw_test_1M.json
+++ b/sweeps/ads10m_lucene_hnsw_1536d.json
@@ -1,19 +1,19 @@
 {
-  "wiki1m_lucene_hnsw_test": {
-    "dataset": "wiki_all_1M",
+  "ads10m_lucene_hnsw": {
+    "dataset": "ads_sourcing-10m",
     "common-params": {
-      "numDocs": 1000000,
+      "numDocs": 10000000,
       "numQueriesToRun": 10000,
       "numWarmUpQueries": 5000,
       "flushFreq": 10000000,
-      "topK": 100,
-      "efSearch": [100],
-      "queryThreads": [8],
+      "topK": 200,
+      "efSearch": [200, 400, 600, 800, 1000],
+      "queryThreads": [64],
       "vectorColName": "vector",
       "createIndexInMemory": false,
       "cleanIndexDirectory": true,
       "saveResultsOnDisk": true,
-      "forceMerge": 0,
+      "forceMerge": 1,
       "enableTieredMerge": true,
       "enableIndexWriterInfoStream": false,
       "filterRejectRate": 0.0
@@ -21,8 +21,8 @@
     "algorithms": {
       "LUCENE_HNSW": {
         "indexDirPath": "hnswIndex",
-        "hnswMaxConn": [32],
-        "hnswBeamWidth": [32],
+        "hnswMaxConn": [32, 64],
+        "hnswBeamWidth": 128,
         "hnswMergeThreads": 32,
         "numIndexThreads": 32
       }
diff --git a/sweeps/hnswlib_base_1M.json b/sweeps/ads10m_lucene_hnsw_96d.json
similarity index 64%
rename from sweeps/hnswlib_base_1M.json
rename to sweeps/ads10m_lucene_hnsw_96d.json
index a1abcde..1cb36d1 100644
--- a/sweeps/hnswlib_base_1M.json
+++ b/sweeps/ads10m_lucene_hnsw_96d.json
@@ -1,19 +1,19 @@
 {
-  "wiki1m_hnswlib_base": {
-    "dataset": "wiki_all_1M",
+  "ads10m_lucene_hnsw": {
+    "dataset": "ads_sourcing-10m",
     "common-params": {
-      "numDocs": 1000000,
+      "numDocs": 10000000,
       "numQueriesToRun": 10000,
       "numWarmUpQueries": 5000,
       "flushFreq": 10000000,
-      "topK": 100,
-      "efSearch": [100, 120, 200, 400, 600, 800],
-      "queryThreads": [1, 8, 64],
+      "topK": 1500,
+      "efSearch": [1500, 2000, 2500, 3000, 4000],
+      "queryThreads": [64],
       "vectorColName": "vector",
       "createIndexInMemory": false,
       "cleanIndexDirectory": true,
       "saveResultsOnDisk": true,
-      "forceMerge": 0,
+      "forceMerge": 1,
       "enableTieredMerge": true,
       "enableIndexWriterInfoStream": false,
       "filterRejectRate": 0.0
@@ -21,8 +21,8 @@
     "algorithms": {
       "LUCENE_HNSW": {
         "indexDirPath": "hnswIndex",
-        "hnswMaxConn": [12, 16, 24, 36],
-        "hnswBeamWidth": [64, 128, 256, 512],
+        "hnswMaxConn": [32, 64],
+        "hnswBeamWidth": 128,
         "hnswMergeThreads": 32,
         "numIndexThreads": 32
       }
diff --git a/sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json b/sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json
deleted file mode 100644
index d79acfe..0000000
--- a/sweeps/cagra_ivfpq_1M_hnsw_sensitivity.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
-  "wiki1m_cagra_ivfpq_hnsw": {
-    "dataset": "wiki_all_1M",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 5000,
-      "flushFreq": 10000000,
-      "topK": 100,
-      "efSearch": [100],
-      "queryThreads": [8],
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false,
-      "cuvsWorkspacePoolSize": 4194304,
-      "filterRejectRate": 0.0
-    },
-    "algorithms": {
-      "CAGRA_HNSW": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphBuildAlgo": "IVF_PQ",
-        "cagraGraphDegree": [32],
-        "cagraIntermediateGraphDegree": [64],
-        "cagraHnswLayers": [1],
-        "cuvsWriterThreads": 4,
-        "numIndexThreads": 4,
-        "hnswMaxConn": [32],
-        "hnswBeamWidth": [32],
-        "cuVSIvfPqIndexParamsNLists": [512, 2048],
-        "cuVSIvfPqIndexParamsPqDim": [0, 128],
-        "cuVSIvfPqIndexParamsPqBits": [4, 8],
-        "cuVSIvfPqIndexParamsKmeansNIters": [10, 25],
-        "cuVSIvfPqSearchParamsNProbes": [10, 50],
-        "cuVSIvfPqParamsRefinementRate": [1, 2]
-      }
-    }
-  }
-}
diff --git a/sweeps/cagra_ivfpq_1M_refine2_rerun.json b/sweeps/cagra_ivfpq_1M_refine2_rerun.json
deleted file mode 100644
index f19cebf..0000000
--- a/sweeps/cagra_ivfpq_1M_refine2_rerun.json
+++ /dev/null
@@ -1,63 +0,0 @@
-{
-  "wiki1m_cagra_ivfpq": {
-    "dataset": "wiki_all_1M",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 5000,
-      "flushFreq": 10000000,
-      "topK": 100,
-      "efSearch": [50, 100, 150],
-      "queryThreads": [1, 8, 64],
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false,
-      "cuvsWorkspacePoolSize": 4194304,
-      "filterRejectRate": 0.0
-    },
-    "algorithms": {
-      "CAGRA_SEARCH": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphBuildAlgo": "IVF_PQ",
-        "cagraGraphDegree": [32],
-        "cagraIntermediateGraphDegree": [64],
-        "cagraHnswLayers": [1],
-        "cuvsWriterThreads": 4,
-        "numIndexThreads": 4,
-        "cagraITopK": 15,
-        "cagraSearchWidth": [4, 16],
-        "cagraSearchAlgo": "SINGLE_CTA",
-        "cagraThreadBlockSize": [0],
-        "cuVSIvfPqIndexParamsConservativeMemoryAllocation": true,
-        "cuVSIvfPqIndexParamsNLists": [1024, 2048],
-        "cuVSIvfPqIndexParamsPqDim": [0],
-        "cuVSIvfPqIndexParamsPqBits": [8],
-        "cuVSIvfPqIndexParamsKmeansNIters": [25],
-        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
-        "cuVSIvfPqParamsRefinementRate": [1, 2]
-      },
-      "CAGRA_HNSW": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphBuildAlgo": "IVF_PQ",
-        "cagraGraphDegree": [32],
-        "cagraIntermediateGraphDegree": [64],
-        "cagraHnswLayers": [1],
-        "cuvsWriterThreads": 4,
-        "numIndexThreads": 4,
-        "hnswMaxConn": [32],
-        "hnswBeamWidth": [32],
-        "cuVSIvfPqIndexParamsConservativeMemoryAllocation": true,
-        "cuVSIvfPqIndexParamsNLists": [1024, 2048],
-        "cuVSIvfPqIndexParamsPqDim": [0],
-        "cuVSIvfPqIndexParamsPqBits": [8],
-        "cuVSIvfPqIndexParamsKmeansNIters": [25],
-        "cuVSIvfPqSearchParamsNProbes": [10, 20, 50],
-        "cuVSIvfPqParamsRefinementRate": [2]
-      }
-    }
-  }
-}
diff --git a/sweeps/cagra_search_sw128_retest_1M.json b/sweeps/cagra_search_sw128_retest_1M.json
deleted file mode 100644
index c9b6b05..0000000
--- a/sweeps/cagra_search_sw128_retest_1M.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "wiki1m_sw128_retest": {
-    "dataset": "wiki_all_1M",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 5000,
-      "flushFreq": 10000000,
-      "topK": 100,
-      "efSearch": [100],
-      "queryThreads": [8],
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false,
-      "cuvsWorkspacePoolSize": 4194304,
-      "filterRejectRate": 0.0
-    },
-    "algorithms": {
-      "CAGRA_SEARCH": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphDegree": [64],
-        "cagraIntermediateGraphDegree": [128],
-        "cagraHnswLayers": [1],
-        "cagraGraphBuildAlgo": "NN_DESCENT",
-        "cuvsWriterThreads": 32,
-        "numIndexThreads": 32,
-        "cagraITopK": 15,
-        "cagraSearchWidth": [128],
-        "cagraSearchAlgo": "SINGLE_CTA",
-        "cagraThreadBlockSize": [0]
-      }
-    }
-  }
-}
diff --git a/sweeps/trials/0/sweeps_trial_cag-hnsw-binary.json b/sweeps/trials/0/sweeps_trial_cag-hnsw-binary.json
deleted file mode 100644
index 3b31ff4..0000000
--- a/sweeps/trials/0/sweeps_trial_cag-hnsw-binary.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "wiki1m": {
-    "dataset": "wiki-1m",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 500000,
-      "topK": 100,
-      "efSearch": 150,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "CAGRA_HNSW_BINARY": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphDegree": [96],
-        "cagraIntermediateGraphDegree": [96],
-        "cagraHnswLayers": [1],
-        "cagraGraphBuildAlgo": "NN_DESCENT",
-        "cuvsWriterThreads": 30,
-        "numIndexThreads": 30
-      }
-    }
-  }
-}
diff --git a/sweeps/trials/0/sweeps_trial_cag-hnsw-scalar.json b/sweeps/trials/0/sweeps_trial_cag-hnsw-scalar.json
deleted file mode 100644
index a244304..0000000
--- a/sweeps/trials/0/sweeps_trial_cag-hnsw-scalar.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "wiki1m": {
-    "dataset": "wiki-1m",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 500000,
-      "topK": 100,
-      "efSearch": 150,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "CAGRA_HNSW_SCALAR": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphDegree": [96],
-        "cagraIntermediateGraphDegree": [96],
-        "cagraHnswLayers": [1],
-        "cagraGraphBuildAlgo": "NN_DESCENT",
-        "cuvsWriterThreads": 30,
-        "numIndexThreads": 30
-      }
-    }
-  }
-}
diff --git a/sweeps/trials/0/sweeps_trial_cag-hnsw.json b/sweeps/trials/0/sweeps_trial_cag-hnsw.json
deleted file mode 100644
index 97fb310..0000000
--- a/sweeps/trials/0/sweeps_trial_cag-hnsw.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "wiki1m": {
-    "dataset": "wiki-1m",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 500000,
-      "topK": 100,
-      "efSearch": 150,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "CAGRA_HNSW": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphDegree": [96],
-        "cagraIntermediateGraphDegree": [96],
-        "cagraHnswLayers": [1],
-        "cagraGraphBuildAlgo": "NN_DESCENT",
-        "cuvsWriterThreads": 30,
-        "numIndexThreads": 30
-      }
-    }
-  }
-}
diff --git a/sweeps/trials/0/sweeps_trial_cag-search.json b/sweeps/trials/0/sweeps_trial_cag-search.json
deleted file mode 100644
index 12b7fef..0000000
--- a/sweeps/trials/0/sweeps_trial_cag-search.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "wiki1m": {
-    "dataset": "wiki-1m",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 500000,
-      "topK": 100,
-      "efSearch": 150,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "CAGRA_SEARCH": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphDegree": [96],
-        "cagraIntermediateGraphDegree": [96],
-        "cagraHnswLayers": [1],
-        "cagraGraphBuildAlgo": "NN_DESCENT",
-        "cuvsWriterThreads": 30,
-        "numIndexThreads": 30,
-        "cagraITopK": 100,
-        "cagraSearchWidth": 5
-      }
-    }
-  }
-}
diff --git a/sweeps/trials/0/sweeps_trial_luc-hnsw.json b/sweeps/trials/0/sweeps_trial_luc-hnsw.json
deleted file mode 100644
index 8c9db18..0000000
--- a/sweeps/trials/0/sweeps_trial_luc-hnsw.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "wiki1m": {
-    "dataset": "wiki-1m",
-    "common-params": {
-      "numDocs": 1000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 500000,
-      "topK": 100,
-      "efSearch": 150,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "LUCENE_HNSW": {
-        "indexDirPath": "hnswIndex",
-        "hnswMaxConn": [128],
-        "hnswBeamWidth": [128],
-        "hnswMergeThreads": 30,
-        "numIndexThreads": 30
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/sweeps/trials/1/sweeps_10M_cag-hnsw.json b/sweeps/trials/1/sweeps_10M_cag-hnsw.json
deleted file mode 100644
index cbbf131..0000000
--- a/sweeps/trials/1/sweeps_10M_cag-hnsw.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "wiki10m": {
-    "dataset": "wiki-10m",
-    "common-params": {
-      "numDocs": 10000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 100000,
-      "topK": 100,
-      "efSearch": 100,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "CAGRA_HNSW": {
-        "indexDirPath": "cuvsIndex",
-        "cagraGraphDegree": [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96],
-        "cagraIntermediateGraphDegree": [8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96],
-        "cagraHnswLayers": [1],
-        "cagraGraphBuildAlgo": "NN_DESCENT",
-        "cuvsWriterThreads": 30,
-        "numIndexThreads": 30
-      }
-    }
-  }
-}
diff --git a/sweeps/trials/1/sweeps_10M_luc-hnsw.json b/sweeps/trials/1/sweeps_10M_luc-hnsw.json
deleted file mode 100644
index 83825dc..0000000
--- a/sweeps/trials/1/sweeps_10M_luc-hnsw.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "wiki10m": {
-    "dataset": "wiki-10m",
-    "common-params": {
-      "numDocs": 10000000,
-      "numQueriesToRun": 10000,
-      "numWarmUpQueries": 20,
-      "flushFreq": 100000,
-      "topK": 100,
-      "efSearch": 100,
-      "queryThreads": 30,
-      "vectorColName": "vector",
-      "createIndexInMemory": false,
-      "cleanIndexDirectory": true,
-      "saveResultsOnDisk": true,
-      "forceMerge": 0,
-      "enableTieredMerge": true,
-      "enableIndexWriterInfoStream": false
-    },
-    "algorithms": {
-      "LUCENE_HNSW": {
-        "indexDirPath": "hnswIndex",
-        "hnswMaxConn": [64, 128, 192, 256, 320, 384, 448, 512],
-        "hnswBeamWidth": [64, 128, 192, 256, 320, 384, 448, 512],
-        "numIndexThreads": 30,
-        "hnswMergeThreads": 30
-      }
-    }
-  }
-}
\ No newline at end of file