Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException {
try {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());

String normalizedDatasetName = datasetName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List<Func
for (var datasetName : datasetNames) {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException {
String datasetName = config.dataset;
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Could not load dataset:" + datasetName)
);
).getDataSet();
// Register dataset info the first time we actually load the dataset for benchmarking
artifacts.registerDataset(datasetName, ds);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException {

// Load dataset
var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
.getDataSet();

// Run artifacts + selections (sys_info/dataset_info/experiments.csv)
RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* Copyright DataStax, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.vector.VectorSimilarityFunction;

import java.util.Objects;
import java.util.function.Supplier;

/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data.
///
/// Metadata such as the dataset name and similarity function are available immediately
/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing
/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}.
///
/// This design allows callers to enumerate or filter available datasets cheaply, and
/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to
/// thread-safe caching.
///
/// Instances are created by {@link DataSetLoader} implementations; callers obtain them
/// through {@link DataSets#loadDataSet(String)}.
///
/// ### Typical usage
/// ```java
/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow();
///
/// // Cheap — no vectors loaded yet
/// System.out.println(info.getName());
/// System.out.println(info.getSimilarityFunction());
///
/// // First call triggers full load; subsequent calls return the cached DataSet
/// DataSet ds = info.getDataSet();
/// ```
///
/// @see DataSet
/// @see DataSetLoader
/// @see DataSets
public class DataSetInfo {
    private final String name;
    private final VectorSimilarityFunction similarityFunction;
    private final Supplier<DataSet> loader;
    // Lazily populated by getDataSet(); volatile so the double-checked locking there is safe.
    private volatile DataSet cached;

    /// Creates a new dataset info handle.
    ///
    /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called.
    /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates /
    /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}.
    ///
    /// @param name the dataset name, used for display and lookup; must not be null
    /// @param similarityFunction the vector similarity function for this dataset
    ///        (e.g. {@link VectorSimilarityFunction#COSINE})
    /// @param loader a supplier that performs the deferred load; invoked at most once;
    ///        must not be null and must not return null
    /// @throws NullPointerException if {@code name} or {@code loader} is null
    public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier<DataSet> loader) {
        // Fail fast here rather than deferring a confusing NPE to getDataSet()
        this.name = Objects.requireNonNull(name, "name");
        this.similarityFunction = similarityFunction;
        this.loader = Objects.requireNonNull(loader, "loader");
    }

    /// Returns the dataset name.
    ///
    /// This is always available without triggering a data load.
    public String getName() {
        return name;
    }

    /// Returns the similarity function for this dataset.
    ///
    /// This is always available without triggering a data load.
    /// For MFD datasets this is always {@link VectorSimilarityFunction#COSINE};
    /// for HDF5 datasets it is inferred from the filename (e.g. {@code -angular} or {@code -euclidean}).
    public VectorSimilarityFunction getSimilarityFunction() {
        return similarityFunction;
    }

    /// Returns the fully loaded and scrubbed {@link DataSet}.
    ///
    /// On the first invocation this triggers the deferred load pipeline, which may involve
    /// reading large vector files from disk, deduplication, zero-vector removal, and
    /// normalization. The result is cached so that subsequent calls return immediately.
    ///
    /// This method is thread-safe: concurrent callers will block until the first load
    /// completes, after which all callers share the same cached instance.
    ///
    /// @return the ready-to-use {@link DataSet}; never null
    /// @throws NullPointerException if the loader returns null (which would otherwise
    ///         silently break the "invoked at most once" guarantee)
    public DataSet getDataSet() {
        // Double-checked locking with a local variable (Effective Java, Item 83):
        // the fast path performs a single volatile read instead of two.
        DataSet result = cached;
        if (result == null) {
            synchronized (this) {
                result = cached;
                if (result == null) {
                    // A null from the loader would leave `cached` unset and re-run the
                    // (expensive) load on every call; reject it explicitly instead.
                    result = Objects.requireNonNull(loader.get(),
                            () -> "loader returned null for dataset " + name);
                    cached = result;
                }
            }
        }
        return result;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,28 @@
*/
public interface DataSetLoader {
/**
* Implementations of this method <EM>MUST NOT</EM> throw exceptions related to the presence or absence of a
* Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle.
*
* <p>The returned handle provides the dataset name and similarity function immediately,
* without loading vector data into memory. The full {@link DataSet} (vectors, ground truth,
* etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}.
*
* <p>Implementations <em>MUST NOT</em> throw exceptions related to the presence or absence of a
* requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with
* exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably
* return from this method, avoiding any {@link System#exit(int)} or similar calls.
* exceptions as usual, including any errors downloading or preparing a dataset which has been found.
* Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls.
*
* <p>Implementations may perform file downloads or other preparation work before returning the handle,
* but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier.
*
* <HR/>
*
* Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* <p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* not found, and info level for when datasets are found and loaded. This can assist users troubleshooting
* diverse data sources.
*
* @param dataSetName
* @return a {@link DataSet}, if found
* @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5})
* @return a {@link DataSetInfo} handle for the dataset, if found
*/
Optional<DataSet> loadDataSet(String dataSetName);
Optional<DataSetInfo> loadDataSet(String dataSetName);
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@ public class DataSetLoaderHDF5 implements DataSetLoader {
/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
public Optional<DataSetInfo> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(path -> {
VectorSimilarityFunction similarity = getVectorSimilarityFunction(path);
return new DataSetInfo(datasetName, similarity, () -> readHdf5Data(path));
});
}

private DataSet readHdf5Data(Path path) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
import software.amazon.awssdk.transfer.s3.model.FileDownload;
import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
Expand All @@ -54,8 +57,9 @@ public class DataSetLoaderMFD implements DataSetLoader {
/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String fileName) {
return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load);
public Optional<DataSetInfo> loadDataSet(String fileName) {
return maybeDownloadFvecs(fileName).map(mfd ->
new DataSetInfo(mfd.name, VectorSimilarityFunction.COSINE, mfd::load));
}

private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
Expand Down Expand Up @@ -95,19 +99,39 @@ private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
.build();

// 3 retries
boolean downloaded = false;
for (int i = 0; i < 3; i++) {
FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
long downloadedSize = Files.size(localPath);
try {
FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
long downloadedSize = Files.size(localPath);

// Check if downloaded file size matches the expected size
if (downloadedSize != downloadResult.response().contentLength()) {
logger.error("Incomplete download (got {} of {} bytes). Retrying...",
downloadedSize, downloadResult.response().contentLength());
Files.deleteIfExists(localPath);
continue;
}

// Validate the file header to catch corrupt downloads
if (!validateVecFileHeader(localPath)) {
logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath);
Files.deleteIfExists(localPath);
continue;
}

// Check if downloaded file size matches the expected size
if (downloadedSize == downloadResult.response().contentLength()) {
logger.info("Downloaded file of length " + downloadedSize);
break; // Successfully downloaded
} else {
logger.error("Incomplete download. Retrying...");
downloaded = true;
break;
} catch (Exception e) {
logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage());
Files.deleteIfExists(localPath);
}
}
if (!downloaded) {
throw new IOException("Failed to download " + urlPath + " after 3 attempts");
}
}
tm.close();
} catch (Exception e) {
Expand All @@ -117,6 +141,17 @@ private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
return Optional.of(mfd);
}

/// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the
/// little-endian int32 dimension/count value is positive and reasonable.
///
/// A truncated file (fewer than 4 bytes) raises {@link java.io.EOFException}, which is an
/// {@link IOException} and therefore also reported as invalid.
///
/// @param path the downloaded vec file to validate
/// @return true if the header looks like a plausible vec file, false otherwise
private static boolean validateVecFileHeader(Path path) {
    // Files.newInputStream is the NIO idiom; avoids the legacy path.toFile() round trip.
    try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(path)))) {
        // Vec files store the vector dimension as a little-endian int32; DataInputStream
        // reads big-endian, so swap the bytes.
        int dimension = Integer.reverseBytes(dis.readInt());
        // Same sanity bound the fvecs reader applies when parsing the full file.
        return dimension > 0 && dimension <= 100_000;
    } catch (IOException e) {
        // Unreadable or truncated header — treat as corrupt, caller will retry.
        return false;
    }
}

private static S3AsyncClientBuilder s3AsyncClientBuilder() {
return S3AsyncClient.builder()
.region(Region.US_EAST_1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@
import java.util.List;
import java.util.Optional;

/// Facade for locating datasets across multiple {@link DataSetLoader} implementations.
///
/// Returns a {@link DataSetInfo} handle whose vector data is loaded lazily on the first
/// call to {@link DataSetInfo#getDataSet()}, allowing callers to inspect dataset metadata
/// (name, similarity function) without incurring the cost of reading vectors into memory.
///
/// @see DataSetInfo
/// @see DataSetLoader
public class DataSets {
private static final Logger logger = LoggerFactory.getLogger(DataSets.class);

Expand All @@ -33,19 +41,28 @@ public class DataSets {
add(new DataSetLoaderMFD());
}};

public static Optional<DataSet> loadDataSet(String dataSetName) {
/// Loads a dataset by name using the {@link #defaultLoaders}.
///
/// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"})
/// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name
public static Optional<DataSetInfo> loadDataSet(String dataSetName) {
return loadDataSet(dataSetName, defaultLoaders);
}

public static Optional<DataSet> loadDataSet(String dataSetName, Collection<DataSetLoader> loaders) {
/// Loads a dataset by name, trying each loader in order until one matches.
///
/// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"})
/// @param loaders the loaders to try, in priority order
/// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name
public static Optional<DataSetInfo> loadDataSet(String dataSetName, Collection<DataSetLoader> loaders) {
logger.info("loading dataset [{}]", dataSetName);
if (dataSetName.endsWith(".hdf5")) {
throw new InvalidParameterException("DataSet names are not meant to be file names. Did you mean " + dataSetName.replace(".hdf5", "") + "? ");
}

for (DataSetLoader loader : loaders) {
logger.trace("trying loader [{}]", loader.getClass().getSimpleName());
Optional<DataSet> dataSetLoaded = loader.loadDataSet(dataSetName);
Optional<DataSetInfo> dataSetLoaded = loader.loadDataSet(dataSetName);
if (dataSetLoaded.isPresent()) {
logger.info("dataset [{}] found with loader [{}]", dataSetName, loader.getClass().getSimpleName());
return dataSetLoaded;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public static void main(String[] args) throws IOException {
// This is a preconfigured dataset that will be downloaded automatically.
DataSet dataset = DataSets.loadDataSet("ada002-100k").orElseThrow(() ->
new RuntimeException("Dataset doesn't exist or wasn't configured correctly")
);
).getDataSet();

// The loaded DataSet provides a RAVV over the base vectors
RandomAccessVectorValues ravv = dataset.getBaseRavv();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public static void main(String[] args) throws IOException {
// the base vectors in-memory.
DataSet dataset = DataSets.loadDataSet("e5-small-v2-100k").orElseThrow(() ->
new RuntimeException("Dataset doesn't exist or wasn't configured correctly")
);
).getDataSet();

// Remember that RAVVs need not be in-memory in the general case.
// We will sample from this RAVV to compute the PQ codebooks.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,12 @@ public static List<VectorFloat<?>> readFvecs(String filePath) {
try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
while (dis.available() > 0) {
var dimension = Integer.reverseBytes(dis.readInt());
assert dimension > 0 : dimension;
if (dimension <= 0) {
throw new IOException("Corrupt fvecs file: negative or zero dimension " + dimension + " (possible file corruption or wrong format)");
}
if (dimension > 100_000) {
throw new IOException("Unreasonable dimension " + dimension + " in fvecs file (possible file corruption or wrong format)");
}
var buffer = new byte[dimension * Float.BYTES];
dis.readFully(buffer);
var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ public static void main(String[] args) throws IOException {
System.out.println("Loading dataset: " + datasetName);
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
System.out.printf("Loaded %d vectors of dimension %d%n", ds.getBaseVectors().size(), ds.getDimension());

var floatVectors = ds.getBaseRavv();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public static class Parameters {
public Parameters() {
    // Calls the HDF5 loader directly (bypassing the DataSets facade), then eagerly
    // materializes the DataSet via getDataSet() since the benchmark needs the vectors.
    this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow(
            () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5")
    ).getDataSet();
    this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length());
}
}
Expand Down