Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions .github/workflows/docker/Dockerfile.linux_aarch64_glibc228
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# =============================================================================
# Dockerfile.linux_aarch64_glibc228
# Purpose: Ubuntu 18.10 gcc-9 + glibc 2.28 + CMake 3.30.0 + PyBind11 build environment (arm64)
# Warning: ubuntu:18.10 is EOL; use only for glibc 2.28 compatibility testing.
# =============================================================================

# Use official Ubuntu 18.10 (Cosmic Cuttlefish), aarch64 variant
# glibc version: 2.28 (confirmed via `ldd --version`)
FROM --platform=linux/arm64 ubuntu:18.10

# Replace Ubuntu mirror with old-releases.ubuntu.com/ubuntu-ports for older glibc compatibility
# Note: arm64 archives live on ports.ubuntu.com, not archive.ubuntu.com
RUN sed -i 's|http://ports.ubuntu.com/ubuntu-ports|http://old-releases.ubuntu.com/ubuntu-ports|g' /etc/apt/sources.list

# Add Ubuntu 20.04 (focal) repo for GCC 9 ONLY
RUN echo "deb http://ports.ubuntu.com/ubuntu-ports/ focal main universe" >> /etc/apt/sources.list && \
echo "deb http://ports.ubuntu.com/ubuntu-ports/ focal-security main universe" >> /etc/apt/sources.list

# Prevent interactive prompts & set non-root user
ENV DEBIAN_FRONTEND=noninteractive \
TZ=Etc/UTC

# Create non-root user for safety (optional but recommended)
RUN useradd -m -u 1000 builder && \
mkdir -p /workspace && chown builder:builder /workspace

# Install base system dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
gcc-9 g++-9 \
ninja-build git curl ca-certificates vim wget lcov gnupg clang-format-18\
rsync lsb-release \
uuid-dev zlib1g-dev libssl-dev libffi-dev \
pybind11-dev && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 90 \
--slave /usr/bin/g++ g++ /usr/bin/g++-9 && \
rm -rf /var/lib/apt/lists/*

# Install Miniforge (Conda) as root, then assign to builder
ENV MINIFORGE_VERSION="latest"
ENV MINIFORGE_HOME="/opt/miniforge3"

RUN curl -sSL "https://github.com/conda-forge/miniforge/releases/${MINIFORGE_VERSION}/download/Miniforge3-Linux-aarch64.sh" -o miniforge.sh && \
bash miniforge.sh -b -p ${MINIFORGE_HOME} && \
rm miniforge.sh && \
chown -R builder:builder ${MINIFORGE_HOME}

# Switch to non-root user
USER builder
ENV PATH="${MINIFORGE_HOME}/bin:${PATH}"
WORKDIR /workspace

# Create conda envs for supported Python versions
RUN conda create -n py310 python=3.10 -y && \
conda create -n py311 python=3.11 -y && \
conda create -n py312 python=3.12 -y
RUN conda clean --all -f -y

# Install CMake 3.30.0 from Kitware official binary
# Ref: https://github.com/Kitware/CMake/releases/tag/v3.30.0
RUN mkdir -p /tmp/cmake && cd /tmp/cmake && \
curl -sSL -o cmake.tar.gz \
"https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-aarch64.tar.gz" && \
tar -xzf cmake.tar.gz --strip-components=1 -C /tmp/cmake && \
mkdir -p /home/builder/.local && \
mv * /home/builder/.local/ && \
chown -R builder:builder /home/builder/.local && \
rm -rf /tmp/cmake

# Add CMake to PATH
ENV PATH="/home/builder/.local/bin:${PATH}"

# Verify installations
RUN cmake --version && \
conda info && \
conda env list && \
python --version && \
gcc --version && \
ldd --version | head -n1

# Final setup
WORKDIR /workspace
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ dist
html
*.lcov.info

# Java / Maven build outputs
target/
*.jar
*.class
*.dll
*.dylib
*.so

# Dependencies
/node_modules

Expand All @@ -51,4 +59,4 @@ allure-*

!build_android.sh
!build_ios.sh

!scripts/build_java_native.sh
19 changes: 18 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(CC_CXX_STANDARD 17)
if(MSVC)
set(INTTYPES_FORMAT VC7)
add_compile_options(/FS) # handle .pdb concurrency
add_compile_options(/utf-8)
add_compile_options(/EHsc) # def c++ exception behavior
add_compile_options(/Zc:preprocessor /Zc:__cplusplus)
add_compile_options(/we4716) # -Werror=return-type
Expand Down Expand Up @@ -73,6 +74,12 @@ message(STATUS "BUILD_PYTHON_BINDINGS:${BUILD_PYTHON_BINDINGS}")
option(BUILD_C_BINDINGS "Build C bindings" ON)
message(STATUS "BUILD_C_BINDINGS:${BUILD_C_BINDINGS}")

option(BUILD_JAVA_JNI_BINDING "Build Java JNI binding" OFF)
message(STATUS "BUILD_JAVA_JNI_BINDING:${BUILD_JAVA_JNI_BINDING}")

option(BUILD_TESTS "Build tests" ON)
message(STATUS "BUILD_TESTS:${BUILD_TESTS}")

option(BUILD_TOOLS "Build tools" ON)
message(STATUS "BUILD_TOOLS:${BUILD_TOOLS}")

Expand Down Expand Up @@ -117,7 +124,17 @@ message(STATUS "USE_OSS_MIRROR:${USE_OSS_MIRROR}")

cc_directory(thirdparty)
cc_directories(src)
cc_directories(tests)

if(BUILD_TESTS)
cc_directories(tests)
endif()

if(BUILD_JAVA_JNI_BINDING)
if(NOT BUILD_C_BINDINGS)
message(FATAL_ERROR "BUILD_JAVA_JNI_BINDING requires BUILD_C_BINDINGS")
endif()
add_subdirectory(java/zvec-java/zvec-java-jni/src/main/native)
endif()

if(BUILD_TOOLS)
cc_directories(tools)
Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,19 @@ npm install @zvec/zvec

If you prefer to build Zvec from source, please check the [Building from Source](https://zvec.org/en/docs/db/build/) guide.

### Java (Preview)

Java preview bindings live in [`java/zvec-java`](./java/zvec-java).

Current validation target:

- Java 25
- macOS ARM64

The recommended Java API is the fluent layer (`ZvecSchemas`, `ZvecSearch`), while advanced users can still use the compatibility layer for direct `CollectionSchema`, `VectorSchema`, and `VectorQuery` control.

See [`java/zvec-java/README.md`](./java/zvec-java/README.md) for build and usage instructions.

## ⚡ One-Minute Example

```python
Expand Down
213 changes: 213 additions & 0 deletions java/zvec-java/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# zvec-java

Java bindings for zvec on the same desktop platforms supported by the native zvec build.

The recommended Java API is the fluent layer: `ZvecSchemas` for schema construction and `ZvecSearch` for query construction.

## Artifacts

- `org.zvec:zvec-java-jni`: JDK 11+ backend using JNI. This is the default choice for new Java users.
- `org.zvec:zvec-java-ffm`: JDK 25 backend using the Foreign Function & Memory API.
- `org.zvec:zvec-java-api`: public API and shared Java implementation. Backend artifacts bring this transitively.

The old `org.zvec:zvec-java` compatibility coordinate has been removed. Existing FFM users should depend on `org.zvec:zvec-java-ffm` directly.

Put exactly one backend artifact on the runtime classpath. If both JNI and FFM are present, startup fails unless `-Dorg.zvec.backend=jni` or `-Dorg.zvec.backend=ffm` is set.

## Requirements

- Java 25 for a full `java/zvec-java` reactor build
- Java 11+ for `zvec-java-jni`
- Java 25 for `zvec-java-ffm`
- Maven 3.8+
- CMake available on `PATH`

Native artifacts are packaged under `META-INF/native/<platform>`. Supported platform ids are
`darwin-aarch64`, `darwin-x86_64`, `linux-aarch64`, `linux-x86_64`, and `windows-x86_64`.
The Maven build uses `host` detection by default. Build each native package on a matching host
runner, or pass `-Dzvec.native.platform=<platform>` in that runner to make the package id explicit.

## Build

```bash
source "$HOME/.sdkman/bin/sdkman-init.sh"
cd java/zvec-java

JAVA_HOME="$HOME/.sdkman/candidates/java/25.0.2-oracle" \
mvn test

# JDK 11 JNI path
JAVA_HOME="$HOME/.sdkman/candidates/java/11.0.26-amzn" \
mvn -pl zvec-java-jni -am test

# JDK 25 FFM path
JAVA_HOME="$HOME/.sdkman/candidates/java/25.0.2-oracle" \
mvn -pl zvec-java-ffm -am test

# Explicit platform package on a Linux x86_64 CI runner
JAVA_HOME="$HOME/.sdkman/candidates/java/11.0.26-amzn" \
mvn -pl zvec-java-jni -am test -Dzvec.native.platform=linux-x86_64
```

## Multi-Platform Release Artifacts

Do not commit built jars or native libraries to git. Keep `target/` local while preparing
release assets, then upload the assembled jars to GitHub Releases or publish them to a
Maven repository.

The release flow is:

1. Build each native platform on a matching runner or machine.
2. Copy the resulting `META-INF/native/<platform>` files into the backend module's
`target/classes` directory.
3. Run `mvn package -DskipTests` without `clean` to preserve the copied platform files.
4. Upload `zvec-java-jni/target/zvec-java-jni-*.jar` and
`zvec-java-ffm/target/zvec-java-ffm-*.jar` as release assets.

The JNI jar should contain `zvec_c_api` and `zvec_java_jni` for each packaged platform.
The FFM jar only needs `zvec_c_api` for each packaged platform.

## Example

An executable JNI quickstart is available in `examples/quickstart-jni`. It consumes
`org.zvec:zvec-java-jni` as a normal Maven dependency.

```bash
cd java/zvec-java
mvn -pl zvec-java-jni -am install -DskipTests

cd examples/quickstart-jni
mvn compile exec:java
```

## Quick Start

```java
import java.util.List;
import org.zvec.Doc;
import org.zvec.Collection;
import org.zvec.CollectionSchema;
import org.zvec.Zvec;
import org.zvec.ZvecSchemas;
import org.zvec.ZvecSearch;

CollectionSchema schema =
ZvecSchemas.collection("docs").string("title").vector("embedding", 4).balanced().build();

try (Collection collection = Zvec.createAndOpen("./docs", schema)) {
collection.insert(
List.of(
Doc.of("doc_1").field("title", "alpha").vector("embedding", new float[] {1f, 0f, 0f, 0f}),
Doc.of("doc_2").field("title", "beta").vector("embedding", new float[] {0f, 1f, 0f, 0f})));

List<Doc> results =
collection.query(
ZvecSearch.vector("embedding", new float[] {1f, 0f, 0f, 0f})
.topK(2)
.project("title")
.build());
}
```

## Common Tuning

Use these fluent tuning methods immediately after `vector(name, dimension)`:

- `fast()` when you want the fastest index build and can trade off some recall
- `balanced()` for the default middle ground
- `accurate()` when search quality matters more than build speed
- `expectedDocCount(...)` when you know the collection size ahead of time

Example:

```java
CollectionSchema schema =
ZvecSchemas.collection("docs")
.string("title")
.vector("embedding", 1536)
.expectedDocCount(1_000_000L)
.balanced()
.build();
```

## Advanced Control

If you need direct HNSW configuration, use the compatibility layer:

```java
import org.zvec.HnswIndexParams;
import org.zvec.HnswQueryParams;
import org.zvec.VectorQuery;
import org.zvec.VectorSchema;

VectorSchema schema =
new VectorSchema("embedding", org.zvec.DataType.VECTOR_FP32, 1536)
.withHnswIndex(new HnswIndexParams(32, 300));

VectorQuery query =
VectorQuery.of("embedding", new float[] {1f, 0f, 0f, 0f})
.hnsw(new HnswQueryParams(128, 0.0f, false, true));
```

## Encrypted Fields

Mark a string field as encrypted in the schema; insert and query call sites stay identical to plaintext code.

```java
import org.zvec.crypto.KeyProvider;

KeyProvider keys = keyId -> myKms.fetchKey(keyId); // 32 bytes for AES-256
CollectionSchema schema = ZvecSchemas.collection("docs")
.string("title")
.string("body").encrypted("body-key-v1")
.vector("embed", 768).balanced()
.build();

try (Collection col = Zvec.createAndOpen("./docs", schema, keys)) {
col.insert(List.of(
Doc.of("d1").field("title", "alpha")
.field("body", "plaintext stays plaintext at the call site")
.vector("embed", v)));

List<Doc> results = col.query(
ZvecSearch.vector("embed", q).topK(10).project("title", "body").build());

// results.get(0).fields().get("body") is already plaintext
}
```

Reopen with the same provider:

```java
try (Collection col = Zvec.openWithKeys("./docs", keys)) { ... }
```

Key rotation (new writes use the new keyId; existing records keep their original):

```java
col.setActiveKeyId("body", "body-key-v2");
```

**Key things to know:**

- AES-256-GCM with a 12-byte random nonce per field per record. Nonce reuse under the same key would be catastrophic; use a `SecureRandom`-backed flow or never reuse keys across processes that don't coordinate nonces.
- `id`, field name, and collection name are bound into AAD automatically. Moving ciphertext between docs/fields/collections is detected.
- Queries cannot filter on encrypted fields. `ZvecSearch.filter("body = 'x'")` throws `IllegalArgumentException`.
- Decryption failures (tamper, missing key, AAD mismatch) abort the entire query — fail-loud by design.
- The library never logs key material, plaintext, or ciphertext. Caller adds logging in their own try/catch as needed.
- A static-key form `.encrypted(keyId, byte[])` is available for tests and demos. Key bytes are never persisted; reopening still requires a `KeyProvider`.
- Sidecar metadata lives at `<collection>/_zvec_enc.json`. Don't hand-edit unless you know what you're doing.

## Scope

Current support:

- create/open collection
- insert documents
- dense float vector query
- string / bool / int64 / double scalar fields

Deferred:

- update, upsert, delete, fetch
- sparse vectors
Loading