This is a Part II project for the Computer Science Tripos at the University of Cambridge. Please see the proposal doc for full context!
gvecdb is a hybrid graph-vector database combining:
- LMDB for durable graph storage (nodes, edges, adjacency indices, properties)
- HNSW approximate nearest neighbor index with MVCC persistence
- Cap'n Proto for schema-aware zero-copy property storage and RPC
- Native float32 SIMD via OxCaml's unboxed types and AVX intrinsics
It can be used as an embedded OCaml library or as a standalone Cap'n Proto RPC server.
- lib/ - core library
  - gvecdb.ml/.mli - public API
  - types.ml - core type definitions and database handle
  - keys.ml - bigstring key encoding/decoding for LMDB
  - store.ml - low-level LMDB operations
  - float32_vec.ml - SIMD float32 distance computation
  - vector_file.ml - append-only mmap'd vector storage
  - hnsw.ml - HNSW parameters
  - hnsw_page.ml - HNSW node page layout and serialization
  - hnsw_mvcc.ml - HNSW MVCC persistence layer
  - knn.ml - brute-force k-NN search
  - schema_registry.ml - runtime schema registration and persistence
  - dynamic_reader.ml - read Cap'n Proto fields by name at runtime
  - filter.ml - property-based edge filtering
  - props_capnp.ml - schema metadata storage
  - bitset.ml, int_heap.ml, int_topk.ml - data structures for search
  - msync.ml/msync_stubs.c - mmap flush bindings
- server/ - Cap'n Proto RPC server
  - gvecdb_api.capnp - RPC schema
  - gvecdb_service.ml - server implementation
  - gvecdb_client.ml - client wrapper
  - main.ml - server entry point
- test/ - test suite
- bench/ - benchmarks (ANN recall/QPS, insertion, graph ops, crash recovery, concurrency)
- scripts/ - benchmark runners, dataset downloaders, plotting
- test_schemas/ - Cap'n Proto schemas for testing
- reports/ - progress reports and design decisions
- vendor/ - vendored dependencies (ocaml-lmdb)
- demo/ - arXiv Explorer full-stack demo app
OxCaml:
opam switch create 5.2.0+ox ocaml-variants.5.2.0+ox

System dependencies (C libraries):
# Ubuntu/Debian
sudo apt install liblmdb-dev capnproto pkg-config
# macOS
brew install lmdb capnp pkg-config
# Arch
sudo pacman -S lmdb capnproto pkgconf

OCaml dependencies:
opam install . --deps-only --with-test -y

git clone --recurse-submodules https://github.com/olifog/gvecdb-ocaml.git
cd gvecdb-ocaml
# If already cloned without submodules:
git submodule update --init

dune build

dune runtest

dune exec server/main.exe -- --db /path/to/my.db

Define your schemas in Cap'n Proto format (e.g., schemas.capnp):
struct Person {
name @0 :Text;
age @1 :UInt32;
email @2 :Text;
}
struct Knows {
since @0 :Int64;
strength @1 :Float32;
}

Compile them in your dune file and use them in OCaml:
(* in your dune file: (libraries gvecdb capnp) *)
module SchemaBuilder = Schemas.Make(Capnp.BytesMessage)
let () =
let db = match Gvecdb.create "/path/to/db" with
| Ok db -> db
| Error e -> failwith (Gvecdb.Error.to_string e)
in
(* register schemas for dynamic field access and filtering *)
let _ = Gvecdb.register_schema_from_capnp db
~kind:Gvecdb.Schema_registry.NodeSchemaKind
~type_name:"person" ~capnp_path:"schemas.capnp"
~struct_name:"Person" () in
(* create nodes *)
let alice = match Gvecdb.create_node db "person" with
| Ok id -> id | Error e -> failwith (Gvecdb.Error.to_string e) in
let bob = match Gvecdb.create_node db "person" with
| Ok id -> id | Error e -> failwith (Gvecdb.Error.to_string e) in
(* set properties as serialized Cap'n Proto bytes *)
let builder = SchemaBuilder.Builder.Person.init_root () in
SchemaBuilder.Builder.Person.name_set builder "Alice";
SchemaBuilder.Builder.Person.age_set_int_exn builder 30;
let msg = SchemaBuilder.Builder.Person.to_message builder in
let bs = (* serialize msg to bigstring *) in
ignore (Gvecdb.set_node_props db alice "person" bs);
(* create edges *)
ignore (Gvecdb.create_edge db "knows" alice bob);
(* query edges *)
let edges = match Gvecdb.get_outbound_edges db alice () with
| Ok es -> es | Error _ -> [] in
List.iter (fun (e : Gvecdb.edge_info) ->
Printf.printf "edge %Ld: [%s] %Ld -> %Ld\n"
e.id e.edge_type e.src e.dst
) edges;
(* add vectors and search *)
ignore (Gvecdb.with_transaction db (fun txn ->
let vec = Gvecdb.Float32_vec.of_array [| 1.0; 0.5; 0.3 |] in
ignore (Gvecdb.create_vector db ~txn Node alice "embedding" vec)));
(* read fields dynamically by name *)
(match Gvecdb.read_node_field db alice "age" with
| Ok (Gvecdb.Dynamic_reader.V_uint32 age) ->
Printf.printf "age: %ld\n" age
| _ -> ());
Gvecdb.close db

To generate a coverage report locally:
mkdir -p _coverage
dune build --instrument-with bisect_ppx
for test in test_basic test_vectors test_hnsw test_hnsw_mvcc \
test_adjacency test_transactions test_schema_filter \
test_integration; do
BISECT_FILE=$PWD/_coverage/bisect dune exec test/${test}.exe
done
bisect-ppx-report summary --coverage-path _coverage
bisect-ppx-report html --coverage-path _coverage -o _coverage/html