From ac158fb0aa7ba1eba3d6aa9fe443c55d8e6165f5 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 24 Nov 2025 14:44:40 +0000 Subject: [PATCH 1/3] fix(db-ivm): normalize Uint8Array keys in Index for content-based comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a bug where joining collections by small Uint8Array keys would fail because the keys were compared by reference instead of by content. The fix normalizes Uint8Array keys (≤128 bytes) to string representations before using them as Map keys, similar to the approach used in PR #779 for value comparison. This ensures that Uint8Array instances with the same byte content are treated as equal, even if they are different objects. Changes: - Added normalizeKey() function to convert small Uint8Arrays to string keys - Updated Index class to normalize keys in all Map operations - Maintained mapping from normalized keys to original keys for iteration - Added test case for joining collections with Uint8Array keys Fixes #896 --- packages/db-ivm/src/indexes.ts | 111 ++++++++++++++----- packages/db-ivm/tests/operators/join.test.ts | 68 ++++++++++++ 2 files changed, 152 insertions(+), 27 deletions(-) diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 7dcb349ba..4bd403397 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -42,6 +42,39 @@ import type { Hash } from "./hashing/index.js" const NO_PREFIX = Symbol(`NO_PREFIX`) type NO_PREFIX = typeof NO_PREFIX +/** + * Threshold for normalizing Uint8Arrays to string representations. + * Arrays larger than this will use reference equality to avoid memory overhead. + * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes) + * while avoiding excessive string allocation for large binary data. + */ +const UINT8ARRAY_NORMALIZE_THRESHOLD = 128 + +/** + * Normalize a key for Map usage. + * Converts Uint8Arrays/Buffers to string representations for content-based equality. + * This enables proper joining on binary keys like ULIDs. + */ +function normalizeKey(key: K): K | string { + // Check if the key is a Uint8Array or Buffer + const isUint8Array = + (typeof Buffer !== `undefined` && key instanceof Buffer) || + key instanceof Uint8Array + + if (isUint8Array) { + // Only normalize small arrays to avoid memory overhead for large binary data + if (key.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) { + // Convert to a string representation that can be used as a Map key + // Use a special prefix to avoid collisions with user strings + return `__u8__${Array.from(key).join(`,`)}` + } + // For large arrays, fall back to reference equality + // Users working with large binary data should use a derived key if needed + } + + return key +} + // A single value is a tuple of the value and the multiplicity. type SingleValue = [TValue, number] @@ -151,6 +184,7 @@ export class Index { */ #inner: IndexMap #consolidatedMultiplicity: Map = new Map() // sum of multiplicities per key + #originalKeys: Map = new Map() // map from normalized keys to original keys constructor() { this.#inner = new Map() @@ -200,7 +234,7 @@ export class Index { * @returns True if the index has the key, false otherwise. */ has(key: TKey): boolean { - return this.#inner.has(key) + return this.#inner.has(normalizeKey(key) as TKey) } /** @@ -209,7 +243,9 @@ export class Index { * @returns True if the key has non-zero consolidated multiplicity, false otherwise. */ hasPresence(key: TKey): boolean { - return (this.#consolidatedMultiplicity.get(key) || 0) !== 0 + return ( + (this.#consolidatedMultiplicity.get(normalizeKey(key) as TKey) || 0) !== 0 + ) } /** @@ -218,15 +254,18 @@ export class Index { * @returns The consolidated multiplicity for the key. */ getConsolidatedMultiplicity(key: TKey): number { - return this.#consolidatedMultiplicity.get(key) || 0 + return this.#consolidatedMultiplicity.get(normalizeKey(key) as TKey) || 0 } /** * Get all keys that have presence (non-zero consolidated multiplicity). * @returns An iterator of keys with non-zero consolidated multiplicity. */ - getPresenceKeys(): Iterable { - return this.#consolidatedMultiplicity.keys() + *getPresenceKeys(): Iterable { + for (const normalizedKey of this.#consolidatedMultiplicity.keys()) { + const originalKey = this.#originalKeys.get(normalizedKey) || normalizedKey + yield originalKey + } } /** @@ -244,7 +283,7 @@ export class Index { * @returns An iterator of value tuples [value, multiplicity]. */ *getIterator(key: TKey): Iterable<[TValue, number]> { - const mapOrSingleValue = this.#inner.get(key) + const mapOrSingleValue = this.#inner.get(normalizeKey(key) as TKey) if (isSingleValue(mapOrSingleValue)) { yield mapOrSingleValue } else if (mapOrSingleValue === undefined) { @@ -273,9 +312,10 @@ export class Index { * @returns An iterable of all key-value pairs (and their multiplicities) in the index. */ *entries(): Iterable<[TKey, [TValue, number]]> { - for (const key of this.#inner.keys()) { - for (const valueTuple of this.getIterator(key)) { - yield [key, valueTuple] + for (const normalizedKey of this.#inner.keys()) { + const originalKey = this.#originalKeys.get(normalizedKey) || normalizedKey + for (const valueTuple of this.getIterator(normalizedKey)) { + yield [originalKey, valueTuple] } } } @@ -287,8 +327,9 @@ export class Index { * @returns An iterator of all *keys* in the index and their corresponding value iterator. */ *entriesIterators(): Iterable<[TKey, Iterable<[TValue, number]>]> { - for (const key of this.#inner.keys()) { - yield [key, this.getIterator(key)] + for (const normalizedKey of this.#inner.keys()) { + const originalKey = this.#originalKeys.get(normalizedKey) || normalizedKey + yield [originalKey, this.getIterator(normalizedKey)] } } @@ -302,27 +343,40 @@ export class Index { // If the multiplicity is 0, do nothing if (multiplicity === 0) return + // Normalize the key for Map operations + const normalizedKey = normalizeKey(key) as TKey + + // Store the original key if this is a new key + if (!this.#originalKeys.has(normalizedKey)) { + this.#originalKeys.set(normalizedKey, key) + } + // Update consolidated multiplicity tracking const newConsolidatedMultiplicity = - (this.#consolidatedMultiplicity.get(key) || 0) + multiplicity + (this.#consolidatedMultiplicity.get(normalizedKey) || 0) + multiplicity if (newConsolidatedMultiplicity === 0) { - this.#consolidatedMultiplicity.delete(key) + this.#consolidatedMultiplicity.delete(normalizedKey) + // Also clean up the original key mapping when the key is completely removed + this.#originalKeys.delete(normalizedKey) } else { - this.#consolidatedMultiplicity.set(key, newConsolidatedMultiplicity) + this.#consolidatedMultiplicity.set( + normalizedKey, + newConsolidatedMultiplicity + ) } - const mapOrSingleValue = this.#inner.get(key) + const mapOrSingleValue = this.#inner.get(normalizedKey) if (mapOrSingleValue === undefined) { // First value for this key - this.#inner.set(key, valueTuple) + this.#inner.set(normalizedKey, valueTuple) return } if (isSingleValue(mapOrSingleValue)) { // Handle transition from single value to map this.#handleSingleValueTransition( - key, + normalizedKey, mapOrSingleValue, value, multiplicity @@ -338,28 +392,31 @@ export class Index { const prefixMap = new PrefixMap() prefixMap.set(NO_PREFIX, mapOrSingleValue) prefixMap.set(prefix, valueTuple) - this.#inner.set(key, prefixMap) + this.#inner.set(normalizedKey, prefixMap) } else { // Add to existing ValueMap const isEmpty = mapOrSingleValue.addValue(value, multiplicity) if (isEmpty) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) + this.#originalKeys.delete(normalizedKey) } } } else { // Handle existing PrefixMap const isEmpty = mapOrSingleValue.addValue(value, multiplicity) if (isEmpty) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) + this.#originalKeys.delete(normalizedKey) } } } /** * Handle the transition from a single value to either a ValueMap or PrefixMap + * Note: The key parameter should already be normalized before calling this method */ #handleSingleValueTransition( - key: TKey, + normalizedKey: TKey, currentSingleValue: SingleValue, newValue: TValue, multiplicity: number @@ -370,9 +427,9 @@ export class Index { if (currentValue === newValue) { const newMultiplicity = currentMultiplicity + multiplicity if (newMultiplicity === 0) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) } else { - this.#inner.set(key, [newValue, newMultiplicity]) + this.#inner.set(normalizedKey, [newValue, newMultiplicity]) } return } @@ -388,9 +445,9 @@ export class Index { ) { const newMultiplicity = currentMultiplicity + multiplicity if (newMultiplicity === 0) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) } else { - this.#inner.set(key, [newValue, newMultiplicity]) + this.#inner.set(normalizedKey, [newValue, newMultiplicity]) } return } @@ -401,7 +458,7 @@ export class Index { const valueMap = new ValueMap() valueMap.set(hash(currentValue), currentSingleValue) valueMap.set(hash(newValue), [newValue, multiplicity]) - this.#inner.set(key, valueMap) + this.#inner.set(normalizedKey, valueMap) } else { // At least one has a prefix, use PrefixMap const prefixMap = new PrefixMap() @@ -418,7 +475,7 @@ export class Index { prefixMap.set(newPrefix, [newValue, multiplicity]) } - this.#inner.set(key, prefixMap) + this.#inner.set(normalizedKey, prefixMap) } } diff --git a/packages/db-ivm/tests/operators/join.test.ts b/packages/db-ivm/tests/operators/join.test.ts index 94d7975cf..6e39c3977 100644 --- a/packages/db-ivm/tests/operators/join.test.ts +++ b/packages/db-ivm/tests/operators/join.test.ts @@ -12,6 +12,7 @@ import { describe(`Operators`, () => { describe(`Join operation`, () => { testJoin() + testUint8ArrayKeyJoin() }) }) @@ -342,3 +343,70 @@ function testJoin() { assertKeyedResults(`join batch processing`, batchResult, expectedResults, 6) }) } + +function testUint8ArrayKeyJoin() { + test(`join with Uint8Array keys compared by content`, () => { + const graph = new D2() + const inputA = graph.newInput<[Uint8Array, string]>() + const inputB = graph.newInput<[Uint8Array, string]>() + const results: Array<[Uint8Array, [string, string]]> = [] + + inputA.pipe( + join(inputB), + output((message) => { + for (const [item] of message.getInner()) { + results.push(item) + } + }) + ) + + graph.finalize() + + // Create separate Uint8Array instances with the same content + const key1A = new Uint8Array([1, 2, 3, 4]) + const key1B = new Uint8Array([1, 2, 3, 4]) + const key2A = new Uint8Array([5, 6, 7, 8]) + const key2B = new Uint8Array([5, 6, 7, 8]) + + // Verify that the arrays are different objects + expect(key1A).not.toBe(key1B) + expect(key2A).not.toBe(key2B) + + // Send data with different Uint8Array instances but same content + inputA.sendData( + new MultiSet([ + [[key1A, `a`], 1], + [[key2A, `b`], 1], + ]) + ) + + inputB.sendData( + new MultiSet([ + [[key1B, `x`], 1], + [[key2B, `y`], 1], + ]) + ) + + graph.run() + + // Should join successfully based on content, not reference + expect(results).toHaveLength(2) + + // Verify the joined data is correct + const sortedResults = results.sort((a, b) => { + const [keyA] = a + const [keyB] = b + return keyA[0]! - keyB[0]! + }) + + const [key1, [val1A, val1B]] = sortedResults[0]! + expect(Array.from(key1)).toEqual([1, 2, 3, 4]) + expect(val1A).toBe(`a`) + expect(val1B).toBe(`x`) + + const [key2, [val2A, val2B]] = sortedResults[1]! + expect(Array.from(key2)).toEqual([5, 6, 7, 8]) + expect(val2A).toBe(`b`) + expect(val2B).toBe(`y`) + }) +} From 01a06b5253a501e452f4d8961fa9143f3d78b796 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 24 Nov 2025 16:02:45 +0000 Subject: [PATCH 2/3] refactor: create shared Uint8Array normalization utilities This refactoring eliminates code duplication by creating shared utilities for Uint8Array normalization that can be used across both db-ivm and db packages. Changes: - Created new module: packages/db-ivm/src/utils/uint8array.ts - Exports UINT8ARRAY_NORMALIZE_THRESHOLD constant (128 bytes) - Exports isUint8Array() helper function - Exports normalizeUint8Array() for byte array to string conversion - Exports normalizeValue() for generic value normalization - Updated packages/db-ivm/src/indexes.ts: - Removed local normalizeKey() function - Now imports and uses normalizeValue from shared utilities - Updated packages/db/src/utils/comparison.ts: - Removed duplicate UINT8ARRAY_NORMALIZE_THRESHOLD constant - Removed duplicate isUint8Array check logic - Now imports and uses shared utilities from @tanstack/db-ivm - Updated packages/db-ivm/src/index.ts: - Added export for utils/uint8array module Benefits: - Single source of truth for Uint8Array normalization logic - Easier maintenance and consistency across packages - Follows DRY principle All tests pass (268 tests in db-ivm, 1699 tests in db). --- packages/db-ivm/src/index.ts | 1 + packages/db-ivm/src/indexes.ts | 34 +--------------- packages/db-ivm/src/utils/uint8array.ts | 52 +++++++++++++++++++++++++ packages/db/src/utils/comparison.ts | 40 +++++-------------- 4 files changed, 64 insertions(+), 63 deletions(-) create mode 100644 packages/db-ivm/src/utils/uint8array.ts diff --git a/packages/db-ivm/src/index.ts b/packages/db-ivm/src/index.ts index 5a605a1be..6e231a0cd 100644 --- a/packages/db-ivm/src/index.ts +++ b/packages/db-ivm/src/index.ts @@ -2,3 +2,4 @@ export * from "./d2.js" export * from "./multiset.js" export * from "./operators/index.js" export * from "./types.js" +export * from "./utils/uint8array.js" diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 4bd403397..c34d346b4 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -35,6 +35,7 @@ import { MultiSet } from "./multiset.js" import { hash } from "./hashing/index.js" +import { normalizeValue as normalizeKey } from "./utils/uint8array.js" import type { Hash } from "./hashing/index.js" // We use a symbol to represent the absence of a prefix, unprefixed values a stored @@ -42,39 +43,6 @@ import type { Hash } from "./hashing/index.js" const NO_PREFIX = Symbol(`NO_PREFIX`) type NO_PREFIX = typeof NO_PREFIX -/** - * Threshold for normalizing Uint8Arrays to string representations. - * Arrays larger than this will use reference equality to avoid memory overhead. - * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes) - * while avoiding excessive string allocation for large binary data. - */ -const UINT8ARRAY_NORMALIZE_THRESHOLD = 128 - -/** - * Normalize a key for Map usage. - * Converts Uint8Arrays/Buffers to string representations for content-based equality. - * This enables proper joining on binary keys like ULIDs. - */ -function normalizeKey(key: K): K | string { - // Check if the key is a Uint8Array or Buffer - const isUint8Array = - (typeof Buffer !== `undefined` && key instanceof Buffer) || - key instanceof Uint8Array - - if (isUint8Array) { - // Only normalize small arrays to avoid memory overhead for large binary data - if (key.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) { - // Convert to a string representation that can be used as a Map key - // Use a special prefix to avoid collisions with user strings - return `__u8__${Array.from(key).join(`,`)}` - } - // For large arrays, fall back to reference equality - // Users working with large binary data should use a derived key if needed - } - - return key -} - // A single value is a tuple of the value and the multiplicity. type SingleValue = [TValue, number] diff --git a/packages/db-ivm/src/utils/uint8array.ts b/packages/db-ivm/src/utils/uint8array.ts new file mode 100644 index 000000000..bc444c6d3 --- /dev/null +++ b/packages/db-ivm/src/utils/uint8array.ts @@ -0,0 +1,52 @@ +/** + * Threshold for normalizing Uint8Arrays to string representations. + * Arrays larger than this will use reference equality to avoid memory overhead. + * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes) + * while avoiding excessive string allocation for large binary data. + */ +export const UINT8ARRAY_NORMALIZE_THRESHOLD = 128 + +/** + * Check if a value is a Uint8Array or Buffer + */ +export function isUint8Array(value: unknown): value is Uint8Array { + return ( + (typeof Buffer !== `undefined` && value instanceof Buffer) || + value instanceof Uint8Array + ) +} + +/** + * Normalize a Uint8Array to a string representation for content-based comparison. + * This enables Uint8Arrays with the same byte content to be treated as equal, + * even if they are different object instances. + * + * @param value - The Uint8Array or Buffer to normalize + * @returns A string representation of the byte array + */ +export function normalizeUint8Array(value: Uint8Array): string { + // Convert to a string representation that can be used as a Map key + // Use a special prefix to avoid collisions with user strings + return `__u8__${Array.from(value).join(`,`)}` +} + +/** + * Normalize a value for Map key or comparison usage. + * Converts small Uint8Arrays/Buffers to string representations for content-based equality. + * This enables proper comparison and Map key usage for binary data like ULIDs. + * + * @param value - The value to normalize + * @returns The normalized value (string for small Uint8Arrays, original value otherwise) + */ +export function normalizeValue(value: T): T | string { + if (isUint8Array(value)) { + // Only normalize small arrays to avoid memory overhead for large binary data + if (value.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) { + return normalizeUint8Array(value) + } + // For large arrays, fall back to reference equality + // Users working with large binary data should use a derived key if needed + } + + return value +} diff --git a/packages/db/src/utils/comparison.ts b/packages/db/src/utils/comparison.ts index 1e4ead81f..f016d4ea1 100644 --- a/packages/db/src/utils/comparison.ts +++ b/packages/db/src/utils/comparison.ts @@ -1,3 +1,7 @@ +import { + isUint8Array, + normalizeValue as normalizeUint8ArrayValue, +} from "@tanstack/db-ivm" import type { CompareOptions } from "../query/builder/types" // WeakMap to store stable IDs for objects @@ -126,14 +130,6 @@ function areUint8ArraysEqual(a: Uint8Array, b: Uint8Array): boolean { return true } -/** - * Threshold for normalizing Uint8Arrays to string representations. - * Arrays larger than this will use reference equality to avoid memory overhead. - * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes) - * while avoiding excessive string allocation for large binary data. - */ -const UINT8ARRAY_NORMALIZE_THRESHOLD = 128 - /** * Normalize a value for comparison and Map key usage * Converts values that can't be directly compared or used as Map keys @@ -144,21 +140,9 @@ export function normalizeValue(value: any): any { return value.getTime() } - // Normalize Uint8Arrays/Buffers to a string representation for Map key usage - // This enables content-based equality for binary data like ULIDs - const isUint8Array = - (typeof Buffer !== `undefined` && value instanceof Buffer) || - value instanceof Uint8Array - - if (isUint8Array) { - // Only normalize small arrays to avoid memory overhead for large binary data - if (value.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) { - // Convert to a string representation that can be used as a Map key - // Use a special prefix to avoid collisions with user strings - return `__u8__${Array.from(value).join(`,`)}` - } - // For large arrays, fall back to reference equality - // Users working with large binary data should use a derived key if needed + // Use shared Uint8Array normalization from db-ivm + if (isUint8Array(value)) { + return normalizeUint8ArrayValue(value) } return value @@ -173,13 +157,9 @@ export function areValuesEqual(a: any, b: any): boolean { return true } - // Check for Uint8Array/Buffer comparison - const aIsUint8Array = - (typeof Buffer !== `undefined` && a instanceof Buffer) || - a instanceof Uint8Array - const bIsUint8Array = - (typeof Buffer !== `undefined` && b instanceof Buffer) || - b instanceof Uint8Array + // Check for Uint8Array/Buffer comparison using shared utility + const aIsUint8Array = isUint8Array(a) + const bIsUint8Array = isUint8Array(b) // If both are Uint8Arrays, compare by content if (aIsUint8Array && bIsUint8Array) { From 457a409903c927cc9abd6af87d5ce35222478170 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 24 Nov 2025 16:46:28 +0000 Subject: [PATCH 3/3] chore: add changeset for Uint8Array join key fix --- .changeset/fix-uint8array-join-keys.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .changeset/fix-uint8array-join-keys.md diff --git a/.changeset/fix-uint8array-join-keys.md b/.changeset/fix-uint8array-join-keys.md new file mode 100644 index 000000000..884074347 --- /dev/null +++ b/.changeset/fix-uint8array-join-keys.md @@ -0,0 +1,12 @@ +--- +"@tanstack/db-ivm": patch +"@tanstack/db": patch +--- + +Fix joining collections by small Uint8Array keys to use content-based comparison instead of reference equality. + +Previously, when joining collections using Uint8Array keys (like ULIDs or UUIDs), the Index class would compare keys by reference rather than by content. This caused joins to fail when the same byte array data existed in separate instances. + +This fix introduces shared Uint8Array normalization utilities that convert small Uint8Arrays (≤128 bytes) to string representations for content-based equality checking. The normalization logic is now shared between `db-ivm` and `db` packages, eliminating code duplication. + +Fixes #896