Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ public static void main(String[] args) throws IOException {
try {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());

String normalizedDatasetName = datasetName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List<Func
for (var datasetName : datasetNames) {
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public static void main(String[] args) throws IOException {
String datasetName = config.dataset;
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Could not load dataset:" + datasetName)
);
).getDataSet();
// Register dataset info the first time we actually load the dataset for benchmarking
artifacts.registerDataset(datasetName, ds);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ public static void main(String[] args) throws IOException {

// Load dataset
var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
.getDataSet();

// Run artifacts + selections (sys_info/dataset_info/experiments.csv)
RunArtifacts artifacts = RunArtifacts.open(runCfg, List.of(config));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* Copyright DataStax, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.vector.VectorSimilarityFunction;

import java.util.Objects;
import java.util.function.Supplier;

/// A lightweight, lazy handle that separates *identifying* a dataset from *loading* its data.
///
/// Metadata such as the dataset name and similarity function are available immediately
/// without any I/O, while the expensive work of reading vectors, deduplicating, scrubbing
/// zero vectors, and normalizing is deferred until the first call to {@link #getDataSet()}.
///
/// This design allows callers to enumerate or filter available datasets cheaply, and
/// ensures that the full load-and-scrub pipeline runs at most once per handle thanks to
/// thread-safe caching.
///
/// Instances are created by {@link DataSetLoader} implementations; callers obtain them
/// through {@link DataSets#loadDataSet(String)}.
///
/// ### Typical usage
/// ```java
/// DataSetInfo info = DataSets.loadDataSet("ada002-100k").orElseThrow();
///
/// // Cheap — no vectors loaded yet
/// System.out.println(info.getName());
/// System.out.println(info.getSimilarityFunction());
///
/// // First call triggers full load; subsequent calls return the cached DataSet
/// DataSet ds = info.getDataSet();
/// ```
///
/// @see DataSet
/// @see DataSetLoader
/// @see DataSets
public class DataSetInfo {
    private final String name;
    private final VectorSimilarityFunction similarityFunction;
    private final Supplier<DataSet> loader;
    // Lazily populated by getDataSet(); volatile so the double-checked locking there is safe.
    private volatile DataSet cached;

    /// Creates a new dataset info handle.
    ///
    /// The supplied {@code loader} will not be invoked until {@link #getDataSet()} is called.
    /// It should perform the full load-and-scrub pipeline (read vectors, remove duplicates /
    /// zero vectors, filter queries, normalize) and return a ready-to-use {@link DataSet}.
    ///
    /// @param name the dataset name, used for display and lookup; must not be null
    /// @param similarityFunction the vector similarity function for this dataset
    ///        (e.g. {@link VectorSimilarityFunction#COSINE})
    /// @param loader a supplier that performs the deferred load; invoked at most once;
    ///        must not be null and must not return null
    /// @throws NullPointerException if {@code name} or {@code loader} is null
    public DataSetInfo(String name, VectorSimilarityFunction similarityFunction, Supplier<DataSet> loader) {
        // Fail fast here rather than deferring a confusing NPE to getDataSet()
        this.name = Objects.requireNonNull(name, "name");
        this.similarityFunction = similarityFunction;
        this.loader = Objects.requireNonNull(loader, "loader");
    }

    /// Returns the dataset name.
    ///
    /// This is always available without triggering a data load.
    public String getName() {
        return name;
    }

    /// Returns the similarity function for this dataset.
    ///
    /// This is always available without triggering a data load.
    /// For MFD datasets this is always {@link VectorSimilarityFunction#COSINE};
    /// for HDF5 datasets it is inferred from the filename (e.g. {@code -angular} or {@code -euclidean}).
    public VectorSimilarityFunction getSimilarityFunction() {
        return similarityFunction;
    }

    /// Returns the fully loaded and scrubbed {@link DataSet}.
    ///
    /// On the first invocation this triggers the deferred load pipeline, which may involve
    /// reading large vector files from disk, deduplication, zero-vector removal, and
    /// normalization. The result is cached so that subsequent calls return immediately.
    ///
    /// This method is thread-safe: concurrent callers will block until the first load
    /// completes, after which all callers share the same cached instance.
    ///
    /// @return the ready-to-use {@link DataSet}; never null
    /// @throws NullPointerException if the loader returns null (which would otherwise
    ///         silently break the "invoked at most once" guarantee)
    public DataSet getDataSet() {
        // Double-checked locking with a local variable (Effective Java, Item 83):
        // the fast path performs a single volatile read instead of two.
        DataSet result = cached;
        if (result == null) {
            synchronized (this) {
                result = cached;
                if (result == null) {
                    // A null from the loader would leave `cached` unset and re-run the
                    // (expensive) load on every call; reject it explicitly instead.
                    result = Objects.requireNonNull(loader.get(),
                            () -> "loader returned null for dataset " + name);
                    cached = result;
                }
            }
        }
        return result;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,28 @@
*/
public interface DataSetLoader {
/**
* Implementations of this method <EM>MUST NOT</EM> throw exceptions related to the presence or absence of a
* Looks up a dataset by name and returns a lightweight {@link DataSetInfo} handle.
*
* <p>The returned handle provides the dataset name and similarity function immediately,
* without loading vector data into memory. The full {@link DataSet} (vectors, ground truth,
* etc.) is loaded lazily on the first call to {@link DataSetInfo#getDataSet()}.
*
* <p>Implementations <em>MUST NOT</em> throw exceptions related to the presence or absence of a
* requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with
* exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably
* return from this method, avoiding any {@link System#exit(int)} or similar calls.
* exceptions as usual, including any errors downloading or preparing a dataset which has been found.
* Implementors should reliably return from this method, avoiding any {@link System#exit(int)} or similar calls.
*
* <p>Implementations may perform file downloads or other preparation work before returning the handle,
* but should defer the expensive parsing and scrubbing of vector data to the {@link DataSetInfo} supplier.
*
* <HR/>
*
* Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* <p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* not found, and info level for when datasets are found and loaded. This can assist users troubleshooting
* diverse data sources.
*
* @param dataSetName
* @return a {@link DataSet}, if found
* @param dataSetName the logical dataset name (not a filename; do not include extensions like {@code .hdf5})
* @return a {@link DataSetInfo} handle for the dataset, if found
*/
Optional<DataSet> loadDataSet(String dataSetName);
Optional<DataSetInfo> loadDataSet(String dataSetName);
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@ public class DataSetLoaderHDF5 implements DataSetLoader {
/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
public Optional<DataSetInfo> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(path -> {
VectorSimilarityFunction similarity = getVectorSimilarityFunction(path);
return new DataSetInfo(datasetName, similarity, () -> readHdf5Data(path));
});
}

private DataSet readHdf5Data(Path path) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
import software.amazon.awssdk.transfer.s3.model.FileDownload;
import software.amazon.awssdk.transfer.s3.progress.LoggingTransferListener;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
Expand All @@ -54,8 +57,9 @@ public class DataSetLoaderMFD implements DataSetLoader {
/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String fileName) {
return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load);
public Optional<DataSetInfo> loadDataSet(String fileName) {
return maybeDownloadFvecs(fileName).map(mfd ->
new DataSetInfo(mfd.name, VectorSimilarityFunction.COSINE, mfd::load));
}

private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
Expand Down Expand Up @@ -95,19 +99,39 @@ private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
.build();

// 3 retries
boolean downloaded = false;
for (int i = 0; i < 3; i++) {
FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
long downloadedSize = Files.size(localPath);
try {
FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
long downloadedSize = Files.size(localPath);

// Check if downloaded file size matches the expected size
if (downloadedSize != downloadResult.response().contentLength()) {
logger.error("Incomplete download (got {} of {} bytes). Retrying...",
downloadedSize, downloadResult.response().contentLength());
Files.deleteIfExists(localPath);
continue;
}

// Validate the file header to catch corrupt downloads
if (!validateVecFileHeader(localPath)) {
logger.error("Downloaded file {} has an invalid header; deleting and retrying", urlPath);
Files.deleteIfExists(localPath);
continue;
}

// Check if downloaded file size matches the expected size
if (downloadedSize == downloadResult.response().contentLength()) {
logger.info("Downloaded file of length " + downloadedSize);
break; // Successfully downloaded
} else {
logger.error("Incomplete download. Retrying...");
downloaded = true;
break;
} catch (Exception e) {
logger.error("Download attempt {} failed for {}: {}", i + 1, urlPath, e.getMessage());
Files.deleteIfExists(localPath);
}
}
if (!downloaded) {
throw new IOException("Failed to download " + urlPath + " after 3 attempts");
}
}
tm.close();
} catch (Exception e) {
Expand All @@ -117,6 +141,17 @@ private Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
return Optional.of(mfd);
}

/// Reads the first 4 bytes of a vec file (fvecs or ivecs) and checks that the
/// little-endian int32 dimension/count value is positive and reasonable.
///
/// A truncated file (fewer than 4 bytes) raises {@link java.io.EOFException}, which is an
/// {@link IOException} and therefore also reported as invalid.
///
/// @param path the downloaded vec file to validate
/// @return true if the header looks like a plausible vec file, false otherwise
private static boolean validateVecFileHeader(Path path) {
    // Files.newInputStream is the NIO idiom; avoids the legacy path.toFile() round trip.
    try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(path)))) {
        // Vec files store the vector dimension as a little-endian int32; DataInputStream
        // reads big-endian, so swap the bytes.
        int dimension = Integer.reverseBytes(dis.readInt());
        // Same sanity bound the fvecs reader applies when parsing the full file.
        return dimension > 0 && dimension <= 100_000;
    } catch (IOException e) {
        // Unreadable or truncated header — treat as corrupt, caller will retry.
        return false;
    }
}

private static S3AsyncClientBuilder s3AsyncClientBuilder() {
return S3AsyncClient.builder()
.region(Region.US_EAST_1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@
import java.util.List;
import java.util.Optional;

/// Facade for locating datasets across multiple {@link DataSetLoader} implementations.
///
/// Returns a {@link DataSetInfo} handle whose vector data is loaded lazily on the first
/// call to {@link DataSetInfo#getDataSet()}, allowing callers to inspect dataset metadata
/// (name, similarity function) without incurring the cost of reading vectors into memory.
///
/// @see DataSetInfo
/// @see DataSetLoader
public class DataSets {
private static final Logger logger = LoggerFactory.getLogger(DataSets.class);

Expand All @@ -33,19 +41,28 @@ public class DataSets {
add(new DataSetLoaderMFD());
}};

public static Optional<DataSet> loadDataSet(String dataSetName) {
/// Loads a dataset by name using the {@link #defaultLoaders}.
///
/// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"})
/// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name
public static Optional<DataSetInfo> loadDataSet(String dataSetName) {
return loadDataSet(dataSetName, defaultLoaders);
}

public static Optional<DataSet> loadDataSet(String dataSetName, Collection<DataSetLoader> loaders) {
/// Loads a dataset by name, trying each loader in order until one matches.
///
/// @param dataSetName the logical dataset name (e.g. {@code "ada002-100k"})
/// @param loaders the loaders to try, in priority order
/// @return a lazy {@link DataSetInfo} handle, or empty if no loader recognises the name
public static Optional<DataSetInfo> loadDataSet(String dataSetName, Collection<DataSetLoader> loaders) {
logger.info("loading dataset [{}]", dataSetName);
if (dataSetName.endsWith(".hdf5")) {
throw new InvalidParameterException("DataSet names are not meant to be file names. Did you mean " + dataSetName.replace(".hdf5", "") + "? ");
}

for (DataSetLoader loader : loaders) {
logger.trace("trying loader [{}]", loader.getClass().getSimpleName());
Optional<DataSet> dataSetLoaded = loader.loadDataSet(dataSetName);
Optional<DataSetInfo> dataSetLoaded = loader.loadDataSet(dataSetName);
if (dataSetLoaded.isPresent()) {
logger.info("dataset [{}] found with loader [{}]", dataSetName, loader.getClass().getSimpleName());
return dataSetLoaded;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public static void main(String[] args) throws IOException {
// This is a preconfigured dataset that will be downloaded automatically.
DataSet dataset = DataSets.loadDataSet("ada002-100k").orElseThrow(() ->
new RuntimeException("Dataset doesn't exist or wasn't configured correctly")
);
).getDataSet();

// The loaded DataSet provides a RAVV over the base vectors
RandomAccessVectorValues ravv = dataset.getBaseRavv();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public static void main(String[] args) throws IOException {
// the base vectors in-memory.
DataSet dataset = DataSets.loadDataSet("e5-small-v2-100k").orElseThrow(() ->
new RuntimeException("Dataset doesn't exist or wasn't configured correctly")
);
).getDataSet();

// Remember that RAVVs need not be in-memory in the general case.
// We will sample from this RAVV to compute the PQ codebooks.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,12 @@ public static List<VectorFloat<?>> readFvecs(String filePath) {
try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
while (dis.available() > 0) {
var dimension = Integer.reverseBytes(dis.readInt());
assert dimension > 0 : dimension;
if (dimension <= 0) {
throw new IOException("Corrupt fvecs file: negative or zero dimension " + dimension + " (possible file corruption or wrong format)");
}
if (dimension > 100_000) {
throw new IOException("Unreasonable dimension " + dimension + " in fvecs file (possible file corruption or wrong format)");
}
var buffer = new byte[dimension * Float.BYTES];
dis.readFully(buffer);
var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ public static void main(String[] args) throws IOException {
System.out.println("Loading dataset: " + datasetName);
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
).getDataSet();
System.out.printf("Loaded %d vectors of dimension %d%n", ds.getBaseVectors().size(), ds.getDimension());

var floatVectors = ds.getBaseRavv();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public static class Parameters {
public Parameters() {
    // Calls the HDF5 loader directly (bypassing the DataSets facade), then eagerly
    // materializes the DataSet via getDataSet() since the benchmark needs the vectors.
    this.ds = new DataSetLoaderHDF5().loadDataSet("hdf5/glove-100-angular.hdf5").orElseThrow(
            () -> new RuntimeException("Unable to load dataset: hdf5/glove-100-angular.hdf5")
    ).getDataSet();
    this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length());
}
}
Expand Down