diff --git a/npm/packages/ruvector/bin/cli.js b/npm/packages/ruvector/bin/cli.js
index 31d86c670..13d2ff0be 100755
--- a/npm/packages/ruvector/bin/cli.js
+++ b/npm/packages/ruvector/bin/cli.js
@@ -45,6 +45,83 @@ function requireRuvector() {
   }
 }
 
+// =============================================================================
+// Database metadata sidecar (#417)
+// -----------------------------------------------------------------------------
+// `dbPath` is a redb (Rust binary) file managed by @ruvector/core. It is NOT
+// a JSON document, so the previous implementation that called
+// `JSON.parse(fs.readFileSync(dbPath))` to recover dimensions crashed
+// immediately on the redb magic bytes "redb…".
+//
+// Instead, every `create` writes `${dbPath}.meta.json` carrying the construction
+// args (dimensions, metric, schema version). `insert`, `search`, `stats` and
+// friends read from the sidecar and pass them straight to the wrapper
+// constructor.
+// =============================================================================
+
+const META_SCHEMA_VERSION = 1;
+
+/** Returns the sidecar path for `dbPath`: the same path with `.meta.json` appended. */
+function metaPathFor(dbPath) {
+  return `${dbPath}.meta.json`;
+}
+
+/**
+ * Writes the sidecar for `dbPath`: `meta.dimensions` and `meta.metric` plus the
+ * schema version, the CLI version and a creation timestamp. An existing
+ * sidecar is overwritten.
+ */
+function writeMeta(dbPath, meta) {
+  const payload = {
+    schemaVersion: META_SCHEMA_VERSION,
+    dimensions: meta.dimensions,
+    metric: meta.metric,
+    cliVersion: packageJson.version,
+    createdAt: new Date().toISOString(),
+  };
+  fs.writeFileSync(metaPathFor(dbPath), JSON.stringify(payload, null, 2));
+}
+
+/**
+ * Reads and validates the sidecar for `dbPath` and returns the parsed object.
+ *
+ * Throws an Error with a user-facing message when the database or its sidecar
+ * is missing, when the sidecar is not valid JSON, or when it does not hold a
+ * positive integer `dimensions`.
+ */
+function readMeta(dbPath) {
+  const metaPath = metaPathFor(dbPath);
+  if (!fs.existsSync(metaPath)) {
+    if (!fs.existsSync(dbPath)) {
+      throw new Error(
+        `Database not found: ${dbPath}\n` +
+          ` Run "ruvector create ${dbPath}" first.`,
+      );
+    }
+    throw new Error(
+      `Database metadata sidecar not found: ${metaPath}\n` +
+        ` This database was created without a sidecar (e.g. before #417 was fixed).\n` +
+        ` Recreate it with "ruvector create ${dbPath} -d DIMENSIONS -m METRIC".`,
+    );
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
+  } catch (e) {
+    throw new Error(`Invalid sidecar at ${metaPath}: ${e.message}`);
+  }
+  // JSON.parse accepts any JSON value, so make sure we got an object before
+  // reading properties: a sidecar containing `null` would otherwise surface as
+  // a bare TypeError instead of the "Invalid sidecar" diagnostic.
+  if (parsed === null || typeof parsed !== 'object') {
+    throw new Error(`Invalid sidecar at ${metaPath}: expected a JSON object`);
+  }
+  if (!Number.isInteger(parsed.dimensions) || parsed.dimensions <= 0) {
+    throw new Error(`Invalid sidecar at ${metaPath}: missing or invalid dimensions`);
+  }
+  return parsed;
+}
+
 // Lazy load GNN (optional - loaded on first use, not at startup)
 // Saves ~6ms startup time by deferring require('@ruvector/gnn')
 let _gnnModule = undefined; // undefined = not yet attempted, null = failed, object = loaded
@@ -157,16 +234,30 @@ program
     const spinner = ora('Creating database...').start();
 
     try {
-      const dimension = parseInt(options.dimension);
-      const db = new VectorDB({
-        dimensions: dimension,
+      const dimensions = parseInt(options.dimension, 10);
+      // Reject non-numeric / non-positive input up front: JSON has no NaN, so
+      // a NaN here could reach the sidecar as `null`, which readMeta() rejects.
+      if (!Number.isInteger(dimensions) || dimensions <= 0) {
+        throw new Error(`Invalid dimension: ${options.dimension}`);
+      }
+      // Construct the redb-backed DB; this creates the file at `dbPath`.
+      // Persistence is automatic via `storagePath` — there is no
+      // separate save() call.
+      // eslint-disable-next-line no-new
+      new VectorDB({
+        dimensions,
         metric: options.metric,
         storagePath: dbPath,
       });
 
+      // Persist the construction args so subsequent commands can recover
+      // them without trying to JSON.parse() the redb binary (#417).
+      writeMeta(dbPath, { dimensions, metric: options.metric });
+
       spinner.succeed(chalk.green(`Database created: ${dbPath}`));
-      console.log(chalk.gray(` Dimension: ${dimension}`));
+      console.log(chalk.gray(` Dimension: ${dimensions}`));
       console.log(chalk.gray(` Metric: ${options.metric}`));
+      console.log(chalk.gray(` Sidecar: ${metaPathFor(dbPath)}`));
       console.log(chalk.gray(` Implementation: ${getImplementationType()}`));
     } catch (error) {
       spinner.fail(chalk.red('Failed to create database'));
@@ -180,43 +271,39 @@ program
   .command('insert ')
   .description('Insert vectors from JSON file')
   .option('-b, --batch-size ', 'Batch size for insertion', '1000')
-  .action((dbPath, file, options) => {
+  .action(async (dbPath, file, options) => {
     requireRuvector();
     const spinner = ora('Loading database...').start();
 
     try {
-      // Read database metadata to get dimension
-      let dimension = 384; // default
-      if (fs.existsSync(dbPath)) {
-        const dbData = fs.readFileSync(dbPath, 'utf8');
-        const parsed = JSON.parse(dbData);
-        dimension = parsed.dimension || 384;
-      }
-
-      const db = new VectorDB({ dimension });
-
-      if (fs.existsSync(dbPath)) {
-        db.load(dbPath);
-      }
+      const meta = readMeta(dbPath);
+      const db = new VectorDB({
+        dimensions: meta.dimensions,
+        metric: meta.metric,
+        storagePath: dbPath,
+      });
 
       spinner.text = 'Reading vectors...';
       const data = JSON.parse(fs.readFileSync(file, 'utf8'));
       const vectors = Array.isArray(data) ? data : [data];
 
+      // Coerce integer ids to strings — the native binding requires string ids.
+      for (const v of vectors) {
+        if (typeof v.id === 'number') v.id = String(v.id);
+      }
+
       spinner.text = `Inserting ${vectors.length} vectors...`;
       const batchSize = parseInt(options.batchSize);
 
       for (let i = 0; i < vectors.length; i += batchSize) {
         const batch = vectors.slice(i, i + batchSize);
-        db.insertBatch(batch);
+        await db.insertBatch(batch);
         spinner.text = `Inserted ${Math.min(i + batchSize, vectors.length)}/${vectors.length} vectors...`;
       }
 
-      db.save(dbPath);
+      const total = await db.len();
       spinner.succeed(chalk.green(`Inserted ${vectors.length} vectors`));
-
-      const stats = db.stats();
-      console.log(chalk.gray(` Total vectors: ${stats.count}`));
+      console.log(chalk.gray(` Total vectors: ${total}`));
     } catch (error) {
       spinner.fail(chalk.red('Failed to insert vectors'));
       console.error(chalk.red(error.message));
@@ -232,18 +319,17 @@ program
   .option('-k, --top-k ', 'Number of results', '10')
   .option('-t, --threshold ', 'Similarity threshold', '0.0')
   .option('-f, --filter ', 'Metadata filter as JSON')
-  .action((dbPath, options) => {
+  .action(async (dbPath, options) => {
     requireRuvector();
     const spinner = ora('Loading database...').start();
 
     try {
-      // Read database metadata
-      const dbData = fs.readFileSync(dbPath, 'utf8');
-      const parsed = JSON.parse(dbData);
-      const dimension = parsed.dimension || 384;
-
-      const db = new VectorDB({ dimension });
-      db.load(dbPath);
+      const meta = readMeta(dbPath);
+      const db = new VectorDB({
+        dimensions: meta.dimensions,
+        metric: meta.metric,
+        storagePath: dbPath,
+      });
 
       spinner.text = 'Searching...';
 
@@ -251,18 +337,21 @@ program
       const query = {
         vector,
         k: parseInt(options.topK),
-        threshold: parseFloat(options.threshold)
       };
 
       if (options.filter) {
         query.filter = JSON.parse(options.filter);
       }
 
-      const results = db.search(query);
-      spinner.succeed(chalk.green(`Found ${results.length} results`));
+      const results = await db.search(query);
+      const threshold = parseFloat(options.threshold);
+      const filtered = threshold > 0
+        ? results.filter((r) => r.score >= threshold)
+        : results;
+      spinner.succeed(chalk.green(`Found ${filtered.length} results`));
 
       console.log(chalk.cyan('\nSearch Results:'));
-      results.forEach((result, i) => {
+      filtered.forEach((result, i) => {
         console.log(chalk.white(`\n${i + 1}. ID: ${result.id}`));
         console.log(chalk.yellow(` Score: ${result.score.toFixed(4)}`));
         if (result.metadata) {
@@ -280,35 +369,32 @@ program
 program
   .command('stats ')
   .description('Show database statistics')
-  .action((dbPath) => {
+  .action(async (dbPath) => {
     requireRuvector();
     const spinner = ora('Loading database...').start();
 
     try {
-      const dbData = fs.readFileSync(dbPath, 'utf8');
-      const parsed = JSON.parse(dbData);
-      const dimension = parsed.dimension || 384;
-
-      const db = new VectorDB({ dimension });
-      db.load(dbPath);
+      const meta = readMeta(dbPath);
+      const db = new VectorDB({
+        dimensions: meta.dimensions,
+        metric: meta.metric,
+        storagePath: dbPath,
+      });
 
-      const stats = db.stats();
+      const count = await db.len();
       spinner.succeed(chalk.green('Database statistics'));
 
       console.log(chalk.cyan('\nDatabase Stats:'));
-      console.log(chalk.white(` Vector Count: ${chalk.yellow(stats.count)}`));
-      console.log(chalk.white(` Dimension: ${chalk.yellow(stats.dimension)}`));
-      console.log(chalk.white(` Metric: ${chalk.yellow(stats.metric)}`));
+      console.log(chalk.white(` Vector Count: ${chalk.yellow(count)}`));
+      console.log(chalk.white(` Dimension: ${chalk.yellow(meta.dimensions)}`));
+      console.log(chalk.white(` Metric: ${chalk.yellow(meta.metric)}`));
       console.log(chalk.white(` Implementation: ${chalk.yellow(getImplementationType())}`));
 
-      if (stats.memoryUsage) {
-        const mb = (stats.memoryUsage / (1024 * 1024)).toFixed(2);
-        console.log(chalk.white(` Memory Usage: ${chalk.yellow(mb + ' MB')}`));
+      if (fs.existsSync(dbPath)) {
+        const fileStats = fs.statSync(dbPath);
+        const fileMb = (fileStats.size / (1024 * 1024)).toFixed(2);
+        console.log(chalk.white(` File Size: ${chalk.yellow(fileMb + ' MB')}`));
       }
-
-      const fileStats = fs.statSync(dbPath);
-      const fileMb = (fileStats.size / (1024 * 1024)).toFixed(2);
-      console.log(chalk.white(` File Size: ${chalk.yellow(fileMb + ' MB')}`));
     } catch (error) {
       spinner.fail(chalk.red('Failed to load database'));
       console.error(chalk.red(error.message));
@@ -323,7 +409,7 @@ program
   .option('-d, --dimension ', 'Vector dimension', '384')
   .option('-n, --num-vectors ', 'Number of vectors', '10000')
   .option('-q, --num-queries ', 'Number of queries', '1000')
-  .action((options) => {
+  .action(async (options) => {
     requireRuvector();
     console.log(chalk.cyan('\nruvector Performance Benchmark'));
     console.log(chalk.gray(`Implementation: ${getImplementationType()}\n`));
@@ -338,7 +424,7 @@ program
     const db = new VectorDB({ dimensions: dimension, metric: 'cosine' });
     spinner.succeed();
 
-    // Insert benchmark
+    // Insert benchmark — must await, the wrapper resolves on actual native completion.
     spinner = ora(`Inserting ${numVectors} vectors...`).start();
     const insertStart = Date.now();
 
@@ -351,14 +437,16 @@ program
       });
     }
 
-    db.insertBatch(vectors);
+    await db.insertBatch(vectors);
     const insertTime = Date.now() - insertStart;
     const insertRate = (numVectors / (insertTime / 1000)).toFixed(0);
 
     spinner.succeed(chalk.green(`Inserted ${numVectors} vectors in ${insertTime}ms`));
     console.log(chalk.gray(` Rate: ${chalk.yellow(insertRate)} vectors/sec`));
 
-    // Search benchmark
+    // Search benchmark — must await each query (#417: previously the
+    // promises were dropped on the floor and the reported rate was just
+    // spinner timing).
     spinner = ora(`Running ${numQueries} searches...`).start();
     const searchStart = Date.now();
 
@@ -367,7 +455,7 @@ program
         vector: Array.from({ length: dimension }, () => Math.random()),
         k: 10
       };
-      db.search(query);
+      await db.search(query);
     }
 
     const searchTime = Date.now() - searchStart;
diff --git a/npm/packages/ruvector/test/cli-fresh-db.test.mjs b/npm/packages/ruvector/test/cli-fresh-db.test.mjs
new file mode 100644
index 000000000..54a15b743
--- /dev/null
+++ b/npm/packages/ruvector/test/cli-fresh-db.test.mjs
@@ -0,0 +1,127 @@
+// End-to-end CLI smoke for issue #417 — verifies that `ruvector create`,
+// `insert`, `search`, and `stats` work on a fresh database (the old
+// implementation crashed on every command after `create` because it
+// JSON.parse()d the redb binary file).
+//
+// Run with:
+//
+//   node test/cli-fresh-db.test.mjs
+
+import { spawnSync } from 'node:child_process';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+import { mkdtempSync, rmSync, writeFileSync, existsSync, readFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const repoRoot = join(__dirname, '..');
+const cli = join(repoRoot, 'bin/cli.js');
+const tmp = mkdtempSync(join(tmpdir(), 'ruvector-417-'));
+const dbPath = join(tmp, 'test.db');
+
+let failures = 0;
+function check(cond, msg, extra) {
+  if (!cond) {
+    console.error('FAIL:', msg);
+    if (extra) console.error(' ', extra);
+    failures++;
+  } else {
+    console.log(' ok:', msg);
+  }
+}
+
+function runCli(args, env = {}) {
+  return spawnSync(process.execPath, [cli, ...args], {
+    cwd: repoRoot,
+    encoding: 'utf8',
+    env: { ...process.env, ...env },
+  });
+}
+
+try {
+  // 1. create — should succeed AND drop a sidecar.
+  let res = runCli(['create', dbPath, '-d', '8', '-m', 'cosine']);
+  check(res.status === 0, '`ruvector create` exits 0', res.stderr || res.stdout);
+  check(existsSync(dbPath), 'redb file exists at dbPath');
+  const sidecar = `${dbPath}.meta.json`;
+  check(existsSync(sidecar), 'sidecar metadata file exists');
+  const meta = JSON.parse(readFileSync(sidecar, 'utf8'));
+  check(meta.dimensions === 8, 'sidecar.dimensions = 8');
+  check(meta.metric === 'cosine', 'sidecar.metric = cosine');
+
+  // 2. insert — should NOT crash with `Unexpected token 'r'` from JSON.parse(redb).
+  const vectorsPath = join(tmp, 'vecs.json');
+  const vectors = [
+    { id: 'a', vector: [1, 0, 0, 0, 0, 0, 0, 0] },
+    { id: 'b', vector: [0, 1, 0, 0, 0, 0, 0, 0] },
+    { id: 'c', vector: [0, 0, 1, 0, 0, 0, 0, 0] },
+  ];
+  writeFileSync(vectorsPath, JSON.stringify(vectors));
+  res = runCli(['insert', dbPath, vectorsPath]);
+  check(res.status === 0, '`ruvector insert` exits 0', res.stderr || res.stdout);
+  check(
+    !res.stderr.includes('Unexpected token') && !res.stdout.includes('Unexpected token'),
+    'insert does not crash JSON.parsing the redb binary',
+    res.stderr || res.stdout,
+  );
+
+  // 3. search — should NOT crash, should return at least one hit.
+  res = runCli([
+    'search',
+    dbPath,
+    '-v',
+    JSON.stringify([1, 0, 0, 0, 0, 0, 0, 0]),
+    '-k',
+    '3',
+  ]);
+  const searchOut = res.stdout + res.stderr;
+  check(res.status === 0, '`ruvector search` exits 0', res.stderr || res.stdout);
+  check(
+    /Found\s+\d+\s+results?/.test(searchOut),
+    'search prints `Found N results` (across stdout/stderr)',
+    searchOut,
+  );
+  check(
+    res.stdout.includes('ID: a') || res.stdout.includes('ID: b'),
+    'search renders at least one hit row',
+    res.stdout,
+  );
+
+  // 4. stats — should NOT crash, should report Vector Count.
+  res = runCli(['stats', dbPath]);
+  check(res.status === 0, '`ruvector stats` exits 0', res.stderr || res.stdout);
+  check(res.stdout.includes('Vector Count'), 'stats prints Vector Count', res.stdout);
+
+  // 5. helpful error when sidecar is absent (regression guard for the
+  //    "user constructs DB without create" path).
+  const orphanDb = join(tmp, 'orphan.db');
+  writeFileSync(orphanDb, 'redb-fake-binary'); // pretend a redb file existed
+  res = runCli(['stats', orphanDb]);
+  check(res.status !== 0, 'stats fails fast on orphan DB without sidecar');
+  check(
+    (res.stderr + res.stdout).includes('sidecar'),
+    'orphan-DB error message mentions sidecar',
+    res.stderr || res.stdout,
+  );
+
+  // 6. a sidecar holding valid JSON that is not an object (here `null`) must
+  //    produce the "Invalid sidecar" diagnostic rather than a raw TypeError.
+  const nullMetaDb = join(tmp, 'null-meta.db');
+  writeFileSync(nullMetaDb, 'redb-fake-binary');
+  writeFileSync(`${nullMetaDb}.meta.json`, 'null');
+  res = runCli(['stats', nullMetaDb]);
+  check(res.status !== 0, 'stats fails fast on a non-object sidecar');
+  check(
+    (res.stderr + res.stdout).includes('Invalid sidecar'),
+    'non-object sidecar error message says "Invalid sidecar"',
+    res.stderr || res.stdout,
+  );
+} finally {
+  try { rmSync(tmp, { recursive: true, force: true }); } catch {}
+}
+
+if (failures > 0) {
+  console.error(`\n${failures} check(s) failed`);
+  process.exit(1);
+}
+console.log(`\nruvector fresh-DB CLI smoke OK (issue #417)`);