Skip to content

Commit 653ec34

Browse files
authored
fix: align and persist remote embedding result metadata (#6)
- Unify embedding result metadata: replace n_tokens_truncated with a shared truncated boolean across local, remote, and custom engines.
- Persist n_tokens and truncated on dbmem_vault and dbmem_cache rows, with schema versioning and automatic migration of existing databases.
- Parse the documented vectors.space envelope (output_dimension, data[0].embedding, data[0].truncated, usage.request_tokens) instead of a flat key scan.
- Add e2e coverage for multi-chunk retrieval and single-chunk inserts around the provider token ceiling and model context window; search tests print per-chunk n_tokens/truncated round-tripped through the API.
- Run extension unit tests in CI (with local-only build fixes), use portable temp paths in sync tests, hash the sync fixture from disk, and isolate curl's ./configure from inherited shell build envs.
- Update the C API reference for the truncated flag on custom provider results.
1 parent eafc5d8 commit 653ec34

10 files changed

Lines changed: 1272 additions & 52 deletions

File tree

.github/workflows/main.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -364,7 +364,7 @@ jobs:
364364
echo "::endgroup::"
365365
366366
echo "::group::build unittest binary for android"
367-
make build/unittest ${{ matrix.make }} SQLITE_AMALGAM=${SQLITE_DIR}/sqlite3.c
367+
make build/unittest ${{ matrix.make }} SQLITE_AMALGAM=${SQLITE_DIR}/sqlite3.c DEFINES="-DTEST_SQLITE_EXTENSION"
368368
echo "::endgroup::"
369369
370370
echo "::group::build e2e binary for android"
@@ -406,12 +406,12 @@ jobs:
406406
407407
- name: unix test sqlite-memory
408408
if: matrix.skip_test != true && matrix.os != 'windows-2022' && matrix.name != 'android'
409-
run: ${{ matrix.name == 'linux-musl' && matrix.arch == 'arm64' && 'docker exec alpine' || '' }} make test ${{ matrix.make && matrix.make || ''}}
409+
run: ${{ matrix.name == 'linux-musl' && matrix.arch == 'arm64' && 'docker exec alpine' || '' }} make test ${{ matrix.make && matrix.make || ''}} DEFINES="-DTEST_SQLITE_EXTENSION"
410410

411411
- name: windows test sqlite-memory
412412
if: matrix.skip_test != true && matrix.name == 'windows'
413413
shell: msys2 {0}
414-
run: make test ${{ matrix.make && matrix.make || ''}}
414+
run: make test ${{ matrix.make && matrix.make || ''}} DEFINES="-DTEST_SQLITE_EXTENSION"
415415

416416
- name: unix e2e sqlite-memory
417417
if: matrix.skip_test != true && matrix.variant != 'local' && matrix.os != 'windows-2022' && matrix.name != 'android'

API.md

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -564,8 +564,8 @@ typedef struct {
564564
**`dbmem_embedding_result_t` struct:**
565565
```c
566566
typedef struct {
567-
int n_tokens; // Number of tokens processed
568-
int n_tokens_truncated; // Tokens that were truncated (0 if none)
567+
int n_tokens; // Number of processed tokens (0 if unknown)
568+
bool truncated; // True when the input was truncated before embedding
569569
int n_embd; // Embedding dimension
570570
float *embedding; // Embedding vector (engine-owned, valid until next call or free)
571571
} dbmem_embedding_result_t;
@@ -574,6 +574,7 @@ typedef struct {
574574
**Notes:**
575575
- Works regardless of `DBMEM_OMIT_LOCAL_ENGINE` / `DBMEM_OMIT_REMOTE_ENGINE` compile flags
576576
- The `embedding` buffer in `dbmem_embedding_result_t` must remain valid until the next `compute` call or `free` — it is engine-owned, not copied by the caller
577+
- `n_tokens` is metadata about the processed input when the engine can provide it; `truncated` is a boolean flag, not a truncated-token count
577578
- Only one custom provider can be registered per connection at a time; registering again replaces the previous one
578579
- The provider struct is copied by value; the caller does not need to keep it alive after registration
579580

@@ -596,7 +597,7 @@ static int my_compute(void *engine, const char *text, int text_len, void *xdata,
596597
// ... fill vec with your embedding ...
597598
result->n_embd = e->dimension;
598599
result->n_tokens = text_len / 4;
599-
result->n_tokens_truncated = 0;
600+
result->truncated = false;
600601
result->embedding = vec;
601602
return 0;
602603
}
@@ -769,6 +770,21 @@ FROM dbmem_content
769770
WHERE last_accessed > 0
770771
ORDER BY last_accessed DESC
771772
LIMIT 10;
773+
774+
-- Tokens consumed and truncation per context
775+
-- (n_tokens / truncated were added in schema version 2)
776+
SELECT
777+
COALESCE(c.context, '(none)') as context,
778+
SUM(v.n_tokens) as tokens_processed,
779+
SUM(v.truncated) as truncated_chunks
780+
FROM dbmem_vault v
781+
JOIN dbmem_content c ON c.hash = v.hash
782+
GROUP BY c.context;
783+
784+
-- Chunks that the embedding model truncated on input
785+
SELECT hash, seq, length, n_tokens
786+
FROM dbmem_vault
787+
WHERE truncated = 1;
772788
```
773789

774790
---

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -561,7 +561,7 @@ ifeq ($(PLATFORM),windows)
561561
else
562562
unzip -o $(CURL_ZIP) -d $(CURL_DIR)/src/.
563563
endif
564-
cd $(CURL_SRC) && ./configure \
564+
cd $(CURL_SRC) && env -u LDFLAGS -u CPPFLAGS -u CFLAGS -u LIBS ./configure \
565565
--without-libpsl \
566566
--disable-alt-svc \
567567
--disable-ares \

src/dbmem-embed.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ typedef struct dbmem_remote_engine_t dbmem_remote_engine_t;
1717
// Embedding result structure (always one embedding per call)
1818
typedef struct {
1919
int n_tokens; // Number of tokens processed
20-
int n_tokens_truncated; // Number of tokens truncated (0 if none)
20+
bool truncated; // True when the input was truncated before embedding
2121
int n_embd; // Embedding dimension
2222
float *embedding; // Pointer to embedding (points to engine's buffer, do not free)
2323
} embedding_result_t;

src/dbmem-lembed.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -223,9 +223,9 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
223223
}
224224

225225
// Handle token overflow: truncate to max context size
226-
int n_tokens_truncated = 0;
226+
bool truncated = false;
227227
if (n_tokens > engine->n_ctx) {
228-
n_tokens_truncated = n_tokens - engine->n_ctx;
228+
truncated = true;
229229
n_tokens = engine->n_ctx;
230230
}
231231

@@ -275,7 +275,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
275275

276276
// Fill result
277277
result->n_tokens = n_tokens;
278-
result->n_tokens_truncated = n_tokens_truncated;
278+
result->truncated = truncated;
279279
result->n_embd = engine->n_embd;
280280
result->embedding = engine->embedding;
281281

src/dbmem-rembed.c

Lines changed: 109 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -210,6 +210,62 @@ static int set_json_error_message (dbmem_remote_engine_t *engine) {
210210
return -1;
211211
}
212212

213+
static int dbmem_json_skip_token (const jsmntok_t *tokens, int index) {
214+
int next = index + 1;
215+
216+
if (tokens[index].type == JSMN_ARRAY) {
217+
for (int i = 0; i < tokens[index].size; i++) {
218+
next = dbmem_json_skip_token(tokens, next);
219+
}
220+
return next;
221+
}
222+
223+
if (tokens[index].type == JSMN_OBJECT) {
224+
for (int i = 0; i < tokens[index].size; i++) {
225+
next += 1; // skip key token
226+
next = dbmem_json_skip_token(tokens, next);
227+
}
228+
return next;
229+
}
230+
231+
return next;
232+
}
233+
234+
static bool dbmem_json_token_equals (const char *json, const jsmntok_t *token, const char *text) {
235+
size_t len = strlen(text);
236+
size_t token_len = (size_t)(token->end - token->start);
237+
return token_len == len && memcmp(json + token->start, text, len) == 0;
238+
}
239+
240+
static int dbmem_json_object_find (const char *json, const jsmntok_t *tokens, int object_index, const char *key) {
241+
if (object_index < 0 || tokens[object_index].type != JSMN_OBJECT) return -1;
242+
243+
int index = object_index + 1;
244+
for (int i = 0; i < tokens[object_index].size; i++) {
245+
int key_index = index;
246+
int value_index = key_index + 1;
247+
248+
if (tokens[key_index].type != JSMN_STRING) return -1;
249+
if (dbmem_json_token_equals(json, &tokens[key_index], key)) return value_index;
250+
251+
index = dbmem_json_skip_token(tokens, value_index);
252+
}
253+
254+
return -1;
255+
}
256+
257+
static bool dbmem_json_parse_bool (const char *json, const jsmntok_t *token) {
258+
size_t len = (size_t)(token->end - token->start);
259+
return token->type == JSMN_PRIMITIVE && len == 4 && memcmp(json + token->start, "true", 4) == 0;
260+
}
261+
262+
#if ENABLE_DBMEM_DEBUG_EMBEDDING
263+
static void dbmem_remote_debug_log_response(dbmem_remote_engine_t *engine, long http_code) {
264+
const char *response = engine->data ? engine->data : "";
265+
DEBUG_DBMEM_ALWAYS("[dbmem-rembed] vectors.space response (HTTP %ld): %s", http_code, response);
266+
}
267+
#endif
268+
213269
// MARK: -
214270

215271
dbmem_remote_engine_t *dbmem_remote_engine_init (void *ctx, const char *provider, const char *model, char err_msg[DBMEM_ERRBUF_SIZE]) {
@@ -450,6 +506,10 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t
450506
sqlite3_free(response_data);
451507
#endif
452508

509+
#if ENABLE_DBMEM_DEBUG_EMBEDDING
510+
dbmem_remote_debug_log_response(engine, http_code);
511+
#endif
512+
453513
if (http_code != 200) {
454514
return set_json_error_message(engine);
455515
}
@@ -480,29 +540,55 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t
480540

481541
// extract fields
482542
int n_embd = 0;
483-
int prompt_tokens = 0;
484-
int estimated_prompt_tokens = 0;
543+
int request_tokens = 0;
544+
bool truncated = false;
485545
int emb_start = -1;
486546
size_t emb_count = 0;
487547

488-
for (int i = 0; i < ntokens - 1; i++) {
489-
if (tokens[i].type != JSMN_STRING) continue;
490-
int klen = tokens[i].end - tokens[i].start;
491-
const char *key = engine->data + tokens[i].start;
492-
493-
if (klen == 9 && memcmp(key, "embedding", 9) == 0 && tokens[i + 1].type == JSMN_ARRAY) {
494-
if (tokens[i + 1].size <= 0) {
495-
dbmem_context_set_error(engine->context, "Invalid embedding array size in API response");
496-
return -1;
497-
}
498-
emb_count = (size_t)tokens[i + 1].size;
499-
emb_start = i + 2;
500-
} else if (klen == 16 && memcmp(key, "output_dimension", 16) == 0) {
501-
n_embd = atoi(engine->data + tokens[i + 1].start);
502-
} else if (klen == 13 && memcmp(key, "prompt_tokens", 13) == 0 && tokens[i + 1].type == JSMN_PRIMITIVE) {
503-
prompt_tokens = atoi(engine->data + tokens[i + 1].start);
504-
} else if (klen == 23 && memcmp(key, "estimated_prompt_tokens", 23) == 0) {
505-
estimated_prompt_tokens = atoi(engine->data + tokens[i + 1].start);
548+
if (tokens[0].type != JSMN_OBJECT) {
549+
dbmem_context_set_error(engine->context, "Invalid API response shape");
550+
return -1;
551+
}
552+
553+
int output_dimension_index = dbmem_json_object_find(engine->data, tokens, 0, "output_dimension");
554+
if (output_dimension_index >= 0 && tokens[output_dimension_index].type == JSMN_PRIMITIVE) {
555+
n_embd = atoi(engine->data + tokens[output_dimension_index].start);
556+
}
557+
558+
int data_index = dbmem_json_object_find(engine->data, tokens, 0, "data");
559+
if (data_index < 0 || tokens[data_index].type != JSMN_ARRAY || tokens[data_index].size <= 0) {
560+
dbmem_context_set_error(engine->context, "Missing embedding data in API response");
561+
return -1;
562+
}
563+
564+
int item_index = data_index + 1;
565+
if (tokens[item_index].type != JSMN_OBJECT) {
566+
dbmem_context_set_error(engine->context, "Invalid embedding item in API response");
567+
return -1;
568+
}
569+
570+
int embedding_index = dbmem_json_object_find(engine->data, tokens, item_index, "embedding");
571+
if (embedding_index < 0 || tokens[embedding_index].type != JSMN_ARRAY) {
572+
dbmem_context_set_error(engine->context, "Missing embedding data in API response");
573+
return -1;
574+
}
575+
if (tokens[embedding_index].size <= 0) {
576+
dbmem_context_set_error(engine->context, "Invalid embedding array size in API response");
577+
return -1;
578+
}
579+
emb_count = (size_t)tokens[embedding_index].size;
580+
emb_start = embedding_index + 1;
581+
582+
int truncated_index = dbmem_json_object_find(engine->data, tokens, item_index, "truncated");
583+
if (truncated_index >= 0) {
584+
truncated = dbmem_json_parse_bool(engine->data, &tokens[truncated_index]);
585+
}
586+
587+
int usage_index = dbmem_json_object_find(engine->data, tokens, 0, "usage");
588+
if (usage_index >= 0 && tokens[usage_index].type == JSMN_OBJECT) {
589+
int request_tokens_index = dbmem_json_object_find(engine->data, tokens, usage_index, "request_tokens");
590+
if (request_tokens_index >= 0 && tokens[request_tokens_index].type == JSMN_PRIMITIVE) {
591+
request_tokens = atoi(engine->data + tokens[request_tokens_index].start);
506592
}
507593
}
508594

@@ -534,12 +620,12 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t
534620

535621
// Fill result
536622
result->n_embd = n_embd;
537-
result->n_tokens = prompt_tokens;
538-
result->n_tokens_truncated = (estimated_prompt_tokens > prompt_tokens) ? estimated_prompt_tokens - prompt_tokens : 0;
623+
result->n_tokens = request_tokens;
624+
result->truncated = truncated;
539625
result->embedding = engine->embedding;
540626

541627
// Update statistics
542-
engine->total_tokens_processed += prompt_tokens;
628+
engine->total_tokens_processed += result->n_tokens;
543629
engine->total_embeddings_generated++;
544630

545631
return 0;

0 commit comments

Comments
 (0)