diff --git a/.changeset/fix-uint8array-join-keys.md b/.changeset/fix-uint8array-join-keys.md new file mode 100644 index 000000000..884074347 --- /dev/null +++ b/.changeset/fix-uint8array-join-keys.md @@ -0,0 +1,12 @@ +--- +"@tanstack/db-ivm": patch +"@tanstack/db": patch +--- + +Fix joining collections by small Uint8Array keys to use content-based comparison instead of reference equality. + +Previously, when joining collections using Uint8Array keys (like ULIDs or UUIDs), the Index class would compare keys by reference rather than by content. This caused joins to fail when the same byte array data existed in separate instances. + +This fix introduces shared Uint8Array normalization utilities that convert small Uint8Arrays (≤128 bytes) to string representations for content-based equality checking. The normalization logic is now shared between `db-ivm` and `db` packages, eliminating code duplication. + +Fixes #896 diff --git a/packages/db-ivm/src/index.ts b/packages/db-ivm/src/index.ts index 5a605a1be..6e231a0cd 100644 --- a/packages/db-ivm/src/index.ts +++ b/packages/db-ivm/src/index.ts @@ -2,3 +2,4 @@ export * from "./d2.js" export * from "./multiset.js" export * from "./operators/index.js" export * from "./types.js" +export * from "./utils/uint8array.js" diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 7dcb349ba..c34d346b4 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -35,6 +35,7 @@ import { MultiSet } from "./multiset.js" import { hash } from "./hashing/index.js" +import { normalizeValue as normalizeKey } from "./utils/uint8array.js" import type { Hash } from "./hashing/index.js" // We use a symbol to represent the absence of a prefix, unprefixed values a stored @@ -151,6 +152,7 @@ export class Index { */ #inner: IndexMap #consolidatedMultiplicity: Map = new Map() // sum of multiplicities per key + #originalKeys: Map = new Map() // map from normalized keys to original keys constructor() { this.#inner = new Map() @@ -200,7 +202,7 @@ export class Index { * @returns True if the index has the key, false otherwise. */ has(key: TKey): boolean { - return this.#inner.has(key) + return this.#inner.has(normalizeKey(key) as TKey) } /** @@ -209,7 +211,9 @@ export class Index { * @returns True if the key has non-zero consolidated multiplicity, false otherwise. */ hasPresence(key: TKey): boolean { - return (this.#consolidatedMultiplicity.get(key) || 0) !== 0 + return ( + (this.#consolidatedMultiplicity.get(normalizeKey(key) as TKey) || 0) !== 0 + ) } /** @@ -218,15 +222,18 @@ export class Index { * @returns The consolidated multiplicity for the key. */ getConsolidatedMultiplicity(key: TKey): number { - return this.#consolidatedMultiplicity.get(key) || 0 + return this.#consolidatedMultiplicity.get(normalizeKey(key) as TKey) || 0 } /** * Get all keys that have presence (non-zero consolidated multiplicity). * @returns An iterator of keys with non-zero consolidated multiplicity. */ - getPresenceKeys(): Iterable { - return this.#consolidatedMultiplicity.keys() + *getPresenceKeys(): Iterable { + for (const normalizedKey of this.#consolidatedMultiplicity.keys()) { + const originalKey = this.#originalKeys.get(normalizedKey) || normalizedKey + yield originalKey + } } /** @@ -244,7 +251,7 @@ export class Index { * @returns An iterator of value tuples [value, multiplicity]. */ *getIterator(key: TKey): Iterable<[TValue, number]> { - const mapOrSingleValue = this.#inner.get(key) + const mapOrSingleValue = this.#inner.get(normalizeKey(key) as TKey) if (isSingleValue(mapOrSingleValue)) { yield mapOrSingleValue } else if (mapOrSingleValue === undefined) { @@ -273,9 +280,10 @@ export class Index { * @returns An iterable of all key-value pairs (and their multiplicities) in the index. */ *entries(): Iterable<[TKey, [TValue, number]]> { - for (const key of this.#inner.keys()) { - for (const valueTuple of this.getIterator(key)) { - yield [key, valueTuple] + for (const normalizedKey of this.#inner.keys()) { + const originalKey = this.#originalKeys.get(normalizedKey) || normalizedKey + for (const valueTuple of this.getIterator(normalizedKey)) { + yield [originalKey, valueTuple] } } } @@ -287,8 +295,9 @@ export class Index { * @returns An iterator of all *keys* in the index and their corresponding value iterator. */ *entriesIterators(): Iterable<[TKey, Iterable<[TValue, number]>]> { - for (const key of this.#inner.keys()) { - yield [key, this.getIterator(key)] + for (const normalizedKey of this.#inner.keys()) { + const originalKey = this.#originalKeys.get(normalizedKey) || normalizedKey + yield [originalKey, this.getIterator(normalizedKey)] } } @@ -302,27 +311,40 @@ export class Index { // If the multiplicity is 0, do nothing if (multiplicity === 0) return + // Normalize the key for Map operations + const normalizedKey = normalizeKey(key) as TKey + + // Store the original key if this is a new key + if (!this.#originalKeys.has(normalizedKey)) { + this.#originalKeys.set(normalizedKey, key) + } + // Update consolidated multiplicity tracking const newConsolidatedMultiplicity = - (this.#consolidatedMultiplicity.get(key) || 0) + multiplicity + (this.#consolidatedMultiplicity.get(normalizedKey) || 0) + multiplicity if (newConsolidatedMultiplicity === 0) { - this.#consolidatedMultiplicity.delete(key) + this.#consolidatedMultiplicity.delete(normalizedKey) + // Also clean up the original key mapping when the key is completely removed + this.#originalKeys.delete(normalizedKey) } else { - this.#consolidatedMultiplicity.set(key, newConsolidatedMultiplicity) + this.#consolidatedMultiplicity.set( + normalizedKey, + newConsolidatedMultiplicity + ) } - const mapOrSingleValue = this.#inner.get(key) + const mapOrSingleValue = this.#inner.get(normalizedKey) if (mapOrSingleValue === undefined) { // First value for this key - this.#inner.set(key, valueTuple) + this.#inner.set(normalizedKey, valueTuple) return } if (isSingleValue(mapOrSingleValue)) { // Handle transition from single value to map this.#handleSingleValueTransition( - key, + normalizedKey, mapOrSingleValue, value, multiplicity @@ -338,28 +360,31 @@ export class Index { const prefixMap = new PrefixMap() prefixMap.set(NO_PREFIX, mapOrSingleValue) prefixMap.set(prefix, valueTuple) - this.#inner.set(key, prefixMap) + this.#inner.set(normalizedKey, prefixMap) } else { // Add to existing ValueMap const isEmpty = mapOrSingleValue.addValue(value, multiplicity) if (isEmpty) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) + this.#originalKeys.delete(normalizedKey) } } } else { // Handle existing PrefixMap const isEmpty = mapOrSingleValue.addValue(value, multiplicity) if (isEmpty) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) + this.#originalKeys.delete(normalizedKey) } } } /** * Handle the transition from a single value to either a ValueMap or PrefixMap + * Note: The key parameter should already be normalized before calling this method */ #handleSingleValueTransition( - key: TKey, + normalizedKey: TKey, currentSingleValue: SingleValue, newValue: TValue, multiplicity: number @@ -370,9 +395,9 @@ export class Index { if (currentValue === newValue) { const newMultiplicity = currentMultiplicity + multiplicity if (newMultiplicity === 0) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) } else { - this.#inner.set(key, [newValue, newMultiplicity]) + this.#inner.set(normalizedKey, [newValue, newMultiplicity]) } return } @@ -388,9 +413,9 @@ export class Index { ) { const newMultiplicity = currentMultiplicity + multiplicity if (newMultiplicity === 0) { - this.#inner.delete(key) + this.#inner.delete(normalizedKey) } else { - this.#inner.set(key, [newValue, newMultiplicity]) + this.#inner.set(normalizedKey, [newValue, newMultiplicity]) } return } @@ -401,7 +426,7 @@ export class Index { const valueMap = new ValueMap() valueMap.set(hash(currentValue), currentSingleValue) valueMap.set(hash(newValue), [newValue, multiplicity]) - this.#inner.set(key, valueMap) + this.#inner.set(normalizedKey, valueMap) } else { // At least one has a prefix, use PrefixMap const prefixMap = new PrefixMap() @@ -418,7 +443,7 @@ export class Index { prefixMap.set(newPrefix, [newValue, multiplicity]) } - this.#inner.set(key, prefixMap) + this.#inner.set(normalizedKey, prefixMap) } } diff --git a/packages/db-ivm/src/utils/uint8array.ts b/packages/db-ivm/src/utils/uint8array.ts new file mode 100644 index 000000000..bc444c6d3 --- /dev/null +++ b/packages/db-ivm/src/utils/uint8array.ts @@ -0,0 +1,52 @@ +/** + * Threshold for normalizing Uint8Arrays to string representations. + * Arrays larger than this will use reference equality to avoid memory overhead. + * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes) + * while avoiding excessive string allocation for large binary data. + */ +export const UINT8ARRAY_NORMALIZE_THRESHOLD = 128 + +/** + * Check if a value is a Uint8Array or Buffer + */ +export function isUint8Array(value: unknown): value is Uint8Array { + return ( + (typeof Buffer !== `undefined` && value instanceof Buffer) || + value instanceof Uint8Array + ) +} + +/** + * Normalize a Uint8Array to a string representation for content-based comparison. + * This enables Uint8Arrays with the same byte content to be treated as equal, + * even if they are different object instances. + * + * @param value - The Uint8Array or Buffer to normalize + * @returns A string representation of the byte array + */ +export function normalizeUint8Array(value: Uint8Array): string { + // Convert to a string representation that can be used as a Map key + // Use a special prefix to avoid collisions with user strings + return `__u8__${Array.from(value).join(`,`)}` +} + +/** + * Normalize a value for Map key or comparison usage. + * Converts small Uint8Arrays/Buffers to string representations for content-based equality. + * This enables proper comparison and Map key usage for binary data like ULIDs. + * + * @param value - The value to normalize + * @returns The normalized value (string for small Uint8Arrays, original value otherwise) + */ +export function normalizeValue(value: T): T | string { + if (isUint8Array(value)) { + // Only normalize small arrays to avoid memory overhead for large binary data + if (value.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) { + return normalizeUint8Array(value) + } + // For large arrays, fall back to reference equality + // Users working with large binary data should use a derived key if needed + } + + return value +} diff --git a/packages/db-ivm/tests/operators/join.test.ts b/packages/db-ivm/tests/operators/join.test.ts index 94d7975cf..6e39c3977 100644 --- a/packages/db-ivm/tests/operators/join.test.ts +++ b/packages/db-ivm/tests/operators/join.test.ts @@ -12,6 +12,7 @@ import { describe(`Operators`, () => { describe(`Join operation`, () => { testJoin() + testUint8ArrayKeyJoin() }) }) @@ -342,3 +343,70 @@ function testJoin() { assertKeyedResults(`join batch processing`, batchResult, expectedResults, 6) }) } + +function testUint8ArrayKeyJoin() { + test(`join with Uint8Array keys compared by content`, () => { + const graph = new D2() + const inputA = graph.newInput<[Uint8Array, string]>() + const inputB = graph.newInput<[Uint8Array, string]>() + const results: Array<[Uint8Array, [string, string]]> = [] + + inputA.pipe( + join(inputB), + output((message) => { + for (const [item] of message.getInner()) { + results.push(item) + } + }) + ) + + graph.finalize() + + // Create separate Uint8Array instances with the same content + const key1A = new Uint8Array([1, 2, 3, 4]) + const key1B = new Uint8Array([1, 2, 3, 4]) + const key2A = new Uint8Array([5, 6, 7, 8]) + const key2B = new Uint8Array([5, 6, 7, 8]) + + // Verify that the arrays are different objects + expect(key1A).not.toBe(key1B) + expect(key2A).not.toBe(key2B) + + // Send data with different Uint8Array instances but same content + inputA.sendData( + new MultiSet([ + [[key1A, `a`], 1], + [[key2A, `b`], 1], + ]) + ) + + inputB.sendData( + new MultiSet([ + [[key1B, `x`], 1], + [[key2B, `y`], 1], + ]) + ) + + graph.run() + + // Should join successfully based on content, not reference + expect(results).toHaveLength(2) + + // Verify the joined data is correct + const sortedResults = results.sort((a, b) => { + const [keyA] = a + const [keyB] = b + return keyA[0]! - keyB[0]! + }) + + const [key1, [val1A, val1B]] = sortedResults[0]! + expect(Array.from(key1)).toEqual([1, 2, 3, 4]) + expect(val1A).toBe(`a`) + expect(val1B).toBe(`x`) + + const [key2, [val2A, val2B]] = sortedResults[1]! + expect(Array.from(key2)).toEqual([5, 6, 7, 8]) + expect(val2A).toBe(`b`) + expect(val2B).toBe(`y`) + }) +} diff --git a/packages/db/src/utils/comparison.ts b/packages/db/src/utils/comparison.ts index 1e4ead81f..f016d4ea1 100644 --- a/packages/db/src/utils/comparison.ts +++ b/packages/db/src/utils/comparison.ts @@ -1,3 +1,7 @@ +import { + isUint8Array, + normalizeValue as normalizeUint8ArrayValue, +} from "@tanstack/db-ivm" import type { CompareOptions } from "../query/builder/types" // WeakMap to store stable IDs for objects @@ -126,14 +130,6 @@ function areUint8ArraysEqual(a: Uint8Array, b: Uint8Array): boolean { return true } -/** - * Threshold for normalizing Uint8Arrays to string representations. - * Arrays larger than this will use reference equality to avoid memory overhead. - * 128 bytes is enough for common ID formats (ULIDs are 16 bytes, UUIDs are 16 bytes) - * while avoiding excessive string allocation for large binary data. - */ -const UINT8ARRAY_NORMALIZE_THRESHOLD = 128 - /** * Normalize a value for comparison and Map key usage * Converts values that can't be directly compared or used as Map keys @@ -144,21 +140,9 @@ export function normalizeValue(value: any): any { return value.getTime() } - // Normalize Uint8Arrays/Buffers to a string representation for Map key usage - // This enables content-based equality for binary data like ULIDs - const isUint8Array = - (typeof Buffer !== `undefined` && value instanceof Buffer) || - value instanceof Uint8Array - - if (isUint8Array) { - // Only normalize small arrays to avoid memory overhead for large binary data - if (value.byteLength <= UINT8ARRAY_NORMALIZE_THRESHOLD) { - // Convert to a string representation that can be used as a Map key - // Use a special prefix to avoid collisions with user strings - return `__u8__${Array.from(value).join(`,`)}` - } - // For large arrays, fall back to reference equality - // Users working with large binary data should use a derived key if needed + // Use shared Uint8Array normalization from db-ivm + if (isUint8Array(value)) { + return normalizeUint8ArrayValue(value) } return value @@ -173,13 +157,9 @@ export function areValuesEqual(a: any, b: any): boolean { return true } - // Check for Uint8Array/Buffer comparison - const aIsUint8Array = - (typeof Buffer !== `undefined` && a instanceof Buffer) || - a instanceof Uint8Array - const bIsUint8Array = - (typeof Buffer !== `undefined` && b instanceof Buffer) || - b instanceof Uint8Array + // Check for Uint8Array/Buffer comparison using shared utility + const aIsUint8Array = isUint8Array(a) + const bIsUint8Array = isUint8Array(b) // If both are Uint8Arrays, compare by content if (aIsUint8Array && bIsUint8Array) {