diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index 7922dd201..882608fbb 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException { try { DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size()); String normalizedDatasetName = datasetName; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 559d665fc..78a85e1fc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index 710301054..343fcbd95 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException { String datasetName = 
config.dataset; DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Could not load dataset:" + datasetName) - ); + ).getDataSet(); // Register dataset info the first time we actually load the dataset for benchmarking artifacts.registerDataset(datasetName, ds); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index f118d7695..032ea2f6c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException { // Load dataset var ds = new DataSetLoaderMFD().loadDataSet(datasetName) - .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")); + .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found")) + .getDataSet(); // Run artifacts + selections (sys_info/dataset_info/experiments.csv) RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config)); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java new file mode 100644 index 000000000..0824410ec --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetInfo.java @@ -0,0 +1,109 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.example.benchmarks.datasets; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; + +import java.util.function.Supplier; + +/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data. +/// +/// Metadata such as the dataset name and similarity function are available immediately +/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing +/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}. +/// +/// This design allows callers to enumerate or filter available datasets cheaply, and +/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to +/// thread-safe caching. +/// +/// Instances are created by {@link DataSetLoader} implementations; callers obtain them +/// through {@link DataSets#loadDataSet(String)}. 
+/// +/// ### Typical usage +/// ```java +/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow(); +/// +/// // Cheap — no vectors loaded yet +/// System.out.println(info.getName()); +/// System.out.println(info.getSimilarityFunction()); +/// +/// // First call triggers full load; subsequent calls return the cached DataSet +/// DataSet ds = info.getDataSet(); +/// ``` +/// +/// @see DataSet +/// @see DataSetLoader +/// @see DataSets +public class DataSetInfo { + private final String name; + private final VectorSimilarityFunction similarityFunction; + private final Supplier<DataSet> loader; + private volatile DataSet cached; + + /// Creates a new dataset info handle. + /// + /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called. + /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates / + /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}. + /// + /// @param name the dataset name, used for display and lookup + /// @param similarityFunction the vector similarity function for this dataset + /// (e.g. {@link VectorSimilarityFunction#COSINE}) + /// @param loader a supplier that performs the deferred load; invoked at most once + public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier<DataSet> loader) { + this.name = name; + this.similarityFunction = similarityFunction; + this.loader = loader; + } + + /// Returns the dataset name. + /// + /// This is always available without triggering a data load. + public String getName() { + return name; + } + + /// Returns the similarity function for this dataset. + /// + /// This is always available without triggering a data load. + /// For MFD datasets this is always {@link VectorSimilarityFunction#COSINE}; + /// for HDF5 datasets it is inferred from the filename (e.g. {@code -angular} or {@code -euclidean}).
+ public VectorSimilarityFunction getSimilarityFunction() { + return similarityFunction; + } + + /// Returns the fully loaded and scrubbed {@link DataSet}. + /// + /// On the first invocation this triggers the deferred load pipeline, which may involve + /// reading large vector files from disk, deduplication, zero-vector removal, and + /// normalization. The result is cached so that subsequent calls return immediately. + /// + /// This method is thread-safe: concurrent callers will block until the first load + /// completes, after which all callers share the same cached instance. + /// + /// @return the ready-to-use {@link DataSet} + public DataSet getDataSet() { + if (cached == null) { + synchronized (this) { + if (cached == null) { + cached = loader.get(); + } + } + } + return cached; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java index d280fbf91..932ea2dc7 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoader.java @@ -23,19 +23,28 @@ */ public interface DataSetLoader { /** - * Implementations of this method MUST NOT throw exceptions related to the presence or absence of a + * Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle. + * + *

<p>The returned handle provides the dataset name and similarity function immediately, + * without loading vector data into memory. The full {@link DataSet} (vectors, ground truth, + * etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}. + * + *

<p>Implementations MUST NOT throw exceptions related to the presence or absence of a * requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with - * exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably - * return from this method, avoiding any {@link System#exit(int)} or similar calls. + * exceptions as usual, including any errors downloading or preparing a dataset which has been found. + * Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls. + * + *

<p>Implementations may perform file downloads or other preparation work before returning the handle, + * but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier. * *


* - * Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are + *

<p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are * not found, and info level for when datasets are found and loaded. This can assist users troubleshooting * diverse data sources. * - * @param dataSetName - * @return a {@link DataSet}, if found + * @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5}) + * @return a {@link DataSetInfo} handle for the dataset, if found */ - Optional<DataSet> loadDataSet(String dataSetName); + Optional<DataSetInfo> loadDataSet(String dataSetName); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java index 072a9b764..d531234a9 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java @@ -48,8 +48,11 @@ public class DataSetLoaderHDF5 implements DataSetLoader { /** * {@inheritDoc} */ - public Optional<DataSet> loadDataSet(String datasetName) { - return maybeDownloadHdf5(datasetName).map(this::readHdf5Data); + public Optional<DataSetInfo> loadDataSet(String datasetName) { + return maybeDownloadHdf5(datasetName).map(path -> { + VectorSimilarityFunction similarity = getVectorSimilarityFunction(path); + return new DataSetInfo(datasetName, similarity, () -> readHdf5Data(path)); + }); } private DataSet readHdf5Data(Path path) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java index 7381f0c35..37e9f8a84 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java +++
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java @@ -31,6 +31,9 @@ import software.amazon.awssdk.transfer.s3.model.FileDownload; import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener; +import java.io.BufferedInputStream; +import java.io.DataInputStream; +import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -54,8 +57,9 @@ public class DataSetLoaderMFD implements DataSetLoader { /** * {@inheritDoc} */ - public Optional loadDataSet(String fileName) { - return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load); + public Optional loadDataSet(String fileName) { + return maybeDownloadFvecs(fileName).map(mfd -> + new DataSetInfo(mfd.name, VectorSimilarityFunction.COSINE, mfd::load)); } private Optional maybeDownloadFvecs(String name) { @@ -95,19 +99,39 @@ private Optional maybeDownloadFvecs(String name) { .build(); // 3 retries + boolean downloaded = false; for (int i = 0; i < 3; i++) { - FileDownload downloadFile = tm.downloadFile(downloadFileRequest); - CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); - long downloadedSize = Files.size(localPath); + try { + FileDownload downloadFile = tm.downloadFile(downloadFileRequest); + CompletedFileDownload downloadResult = downloadFile.completionFuture().join(); + long downloadedSize = Files.size(localPath); + + // Check if downloaded file size matches the expected size + if (downloadedSize != downloadResult.response().contentLength()) { + logger.error("Incomplete download (got {} of {} bytes). 
Retrying...", + downloadedSize, downloadResult.response().contentLength()); + Files.deleteIfExists(localPath); + continue; + } + + // Validate the file header to catch corrupt downloads + if (!validateVecFileHeader(localPath)) { + logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath); + Files.deleteIfExists(localPath); + continue; + } - // Check if downloaded file size matches the expected size - if (downloadedSize == downloadResult.response().contentLength()) { logger.info("Downloaded file of length " + downloadedSize); - break; // Successfully downloaded - } else { - logger.error("Incomplete download. Retrying..."); + downloaded = true; + break; + } catch (Exception e) { + logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage()); + Files.deleteIfExists(localPath); } } + if (!downloaded) { + throw new IOException("Failed to download " + urlPath + " after 3 attempts"); + } } tm.close(); } catch (Exception e) { @@ -117,6 +141,17 @@ private Optional maybeDownloadFvecs(String name) { return Optional.of(mfd); } + /// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the + /// little-endian int32 dimension/count value is positive and reasonable. 
+ private static boolean validateVecFileHeader(Path path) { + try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))) { + int dimension = Integer.reverseBytes(dis.readInt()); + return dimension > 0 && dimension <= 100_000; + } catch (IOException e) { + return false; + } + } + private static S3AsyncClientBuilder s3AsyncClientBuilder() { return S3AsyncClient.builder() .region(Region.US_EAST_1) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java index da27c8f2c..449ff4fc6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSets.java @@ -25,6 +25,14 @@ import java.util.List; import java.util.Optional; +/// Facade for locating datasets across multiple {@link DataSetLoader} implementations. +/// +/// Returns a {@link DataSetInfo} handle whose vector data is loaded lazily on the first +/// call to {@link DataSetInfo#getDataSet()}, allowing callers to inspect dataset metadata +/// (name, similarity function) without incurring the cost of reading vectors into memory. +/// +/// @see DataSetInfo +/// @see DataSetLoader public class DataSets { private static final Logger logger = LoggerFactory.getLogger(DataSets.class); @@ -33,11 +41,20 @@ public class DataSets { add(new DataSetLoaderMFD()); }}; - public static Optional loadDataSet(String dataSetName) { + /// Loads a dataset by name using the {@link #defaultLoaders}. + /// + /// @param dataSetName the logical dataset name (e.g. 
{@code "ada002-100k"}) + /// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name + public static Optional loadDataSet(String dataSetName) { return loadDataSet(dataSetName, defaultLoaders); } - public static Optional loadDataSet(String dataSetName, Collection loaders) { + /// Loads a dataset by name, trying each loader in order until one matches. + /// + /// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"}) + /// @param loaders the loaders to try, in priority order + /// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name + public static Optional loadDataSet(String dataSetName, Collection loaders) { logger.info("loading dataset [{}]", dataSetName); if (dataSetName.endsWith(".hdf5")) { throw new InvalidParameterException("DataSet names are not meant to be file names. Did you mean " + dataSetName.replace(".hdf5", "") + "? "); @@ -45,7 +62,7 @@ public static Optional loadDataSet(String dataSetName, Collection dataSetLoaded = loader.loadDataSet(dataSetName); + Optional dataSetLoaded = loader.loadDataSet(dataSetName); if (dataSetLoaded.isPresent()) { logger.info("dataset [{}] found with loader [{}]", dataSetName, loader.getClass().getSimpleName()); return dataSetLoaded; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java index e38c5c5b8..cfb70da09 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/DiskIntro.java @@ -52,7 +52,7 @@ public static void main(String[] args) throws IOException { // This is a preconfigured dataset that will be downloaded automatically. 
DataSet dataset = DataSets.loadDataSet("ada002-100k").orElseThrow(() -> new RuntimeException("Dataset doesn't exist or wasn't configured correctly") - ); + ).getDataSet(); // The loaded DataSet provides a RAVV over the base vectors RandomAccessVectorValues ravv = dataset.getBaseRavv(); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java index 4e16cda5d..5f22b1cbc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/tutorial/LargerThanMemory.java @@ -62,7 +62,7 @@ public static void main(String[] args) throws IOException { // the base vectors in-memory. DataSet dataset = DataSets.loadDataSet("e5-small-v2-100k").orElseThrow(() -> new RuntimeException("Dataset doesn't exist or wasn't configured correctly") - ); + ).getDataSet(); // Remember that RAVVs need not be in-memory in the general case. // We will sample from this RAVV to compute the PQ codebooks. 
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java index dd130e04b..a491d0c9e 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java @@ -39,7 +39,12 @@ public static List> readFvecs(String filePath) { try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) { while (dis.available() > 0) { var dimension = Integer.reverseBytes(dis.readInt()); - assert dimension > 0 : dimension; + if (dimension <= 0) { + throw new IOException("Corrupt fvecs file: negative or zero dimension " + dimension + " (possible file corruption or wrong format)"); + } + if (dimension > 100_000) { + throw new IOException("Unreasonable dimension " + dimension + " in fvecs file (possible file corruption or wrong format)"); + } var buffer = new byte[dimension * Float.BYTES]; dis.readFully(buffer); var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java index 04f766f38..f3728234c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/graph/disk/ParallelWriteExample.java @@ -304,7 +304,7 @@ public static void main(String[] args) throws IOException { System.out.println("Loading dataset: " + datasetName); DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow( () -> new RuntimeException("Dataset " + datasetName + " not found") - ); + ).getDataSet(); System.out.printf("Loaded %d vectors of dimension %d%n", ds.getBaseVectors().size(), ds.getDimension()); var floatVectors = 
ds.getBaseRavv(); diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java index 559048dcb..8e9cc712f 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java @@ -46,7 +46,7 @@ public static class Parameters { public Parameters() { this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow( () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5" ) - ); + ).getDataSet(); this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length()); } }