Skip to content

Commit 38b7fa6

Browse files
committed
feat: add schema parameter to tableFromArrays and new recordBatchFromArrays factory
Allow callers to pass an explicit Schema to tableFromArrays(), and add a new recordBatchFromArrays() factory, giving control over column types, ordering, nullability, and metadata instead of relying solely on type inference. Also adds a fast path in vectorFromArray for TypedArray-to-typed-vector coercion with BigInt boundary validation.
1 parent 15b0f0b commit 38b7fa6

7 files changed

Lines changed: 389 additions & 12 deletions

File tree

src/Arrow.dom.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ export {
7474
tableFromIPC, tableToIPC,
7575
MessageReader, AsyncMessageReader, JSONMessageReader,
7676
Message,
77-
RecordBatch,
77+
RecordBatch, recordBatchFromArrays,
7878
util,
7979
Builder, makeBuilder, builderThroughIterable, builderThroughAsyncIterable,
8080
compressionRegistry, CompressionType,

src/Arrow.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ export { compressionRegistry } from './ipc/compression/registry.js';
9999
export type { Codec } from './ipc/compression/registry.js';
100100
export { MessageReader, AsyncMessageReader, JSONMessageReader } from './ipc/message.js';
101101
export { Message } from './ipc/metadata/message.js';
102-
export { RecordBatch } from './recordbatch.js';
102+
export { RecordBatch, recordBatchFromArrays } from './recordbatch.js';
103103
export type { ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions } from './io/interfaces.js';
104104

105105
export {

src/factories.ts

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,35 @@ export function vectorFromArray<T extends dtypes.DataType>(data: DataProps<T>):
8080
export function vectorFromArray<T extends TypedArray | BigIntArray | readonly unknown[]>(data: T): Vector<ArrayDataType<T>>;
8181

8282
export function vectorFromArray(init: any, type?: dtypes.DataType) {
83-
if (init instanceof Data || init instanceof Vector || init.type instanceof dtypes.DataType || ArrayBuffer.isView(init)) {
83+
if (init instanceof Data || init instanceof Vector || init.type instanceof dtypes.DataType) {
8484
return makeVector(init as any);
8585
}
86+
if (ArrayBuffer.isView(init) && !type) {
87+
return makeVector(init as any);
88+
}
89+
if (ArrayBuffer.isView(init) && type) {
90+
// Validate BigInt/number boundary
91+
const isBigIntInput = init instanceof BigInt64Array || init instanceof BigUint64Array;
92+
const isBigIntTarget = type.ArrayType === BigInt64Array || type.ArrayType === BigUint64Array;
93+
if (isBigIntInput && !isBigIntTarget) {
94+
throw new TypeError(
95+
`Cannot convert BigInt input to ${type}. BigInt arrays can only target BigInt-based types (e.g. Int64, Uint64).`
96+
);
97+
}
98+
if (!isBigIntInput && isBigIntTarget) {
99+
throw new TypeError(
100+
`Cannot convert non-BigInt input to ${type}. ${type} requires BigInt values.`
101+
);
102+
}
103+
104+
// Fast path: direct TypedArray conversion for Int and Float types
105+
if (dtypes.DataType.isInt(type) || dtypes.DataType.isFloat(type)) {
106+
const data = init.constructor === type.ArrayType
107+
? init // zero-copy, same TypedArray type
108+
: new (type.ArrayType as any)(init); // standard JS TypedArray conversion
109+
return makeVector({ type, data, offset: 0, length: data.length, nullCount: 0 } as any);
110+
}
111+
}
86112
const options: IterableBuilderOptions = { type: type ?? inferType(init), nullValues: [null] };
87113
const chunks = [...builderThroughIterable(options)(init)];
88114
const vector = chunks.length === 1 ? chunks[0] : chunks.reduce((a, b) => a.concat(b));

src/recordbatch.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import { Vector } from './vector.js';
2121
import { Schema, Field } from './schema.js';
2222
import { DataType, Struct, Null, TypeMap } from './type.js';
2323
import { wrapIndex } from './util/vector.js';
24+
import { vectorFromArray } from './factories.js';
25+
import { ArrayDataType, BigIntArray, TypedArray } from './interfaces.js';
2426

2527
import { instance as getVisitor } from './visitor/get.js';
2628
import { instance as setVisitor } from './visitor/set.js';
@@ -306,6 +308,61 @@ Object.defineProperty(RecordBatch, Symbol.hasInstance, {
306308
},
307309
});
308310

311+
/**
312+
* Creates a new RecordBatch from an object of typed arrays or JavaScript arrays.
313+
*
314+
* @example
315+
* ```ts
316+
* const batch = recordBatchFromArrays({
317+
* a: [1, 2, 3],
318+
* b: new Int8Array([1, 2, 3]),
319+
* });
320+
* ```
321+
*
322+
* @example
323+
* ```ts
324+
* const schema = new Schema([
325+
* new Field('a', new Int32),
326+
* new Field('b', new Utf8),
327+
* ]);
328+
* const batch = recordBatchFromArrays({ a: [1, 2, 3], b: ['x', 'y', 'z'] }, schema);
329+
* ```
330+
*
331+
* @param input An object mapping column names to typed arrays or JavaScript arrays.
332+
* @param schema Optional schema to control column types, ordering, nullability, and metadata.
333+
* @returns A new RecordBatch.
334+
*/
335+
export function recordBatchFromArrays<T extends TypeMap>(
336+
input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
337+
schema: Schema<T>
338+
): RecordBatch<T>;
339+
export function recordBatchFromArrays<I extends Record<string | number | symbol, TypedArray | BigIntArray | readonly unknown[]>>(
340+
input: I
341+
): RecordBatch<{ [P in keyof I]: ArrayDataType<I[P]> }>;
342+
export function recordBatchFromArrays(
343+
input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
344+
schema?: Schema
345+
): RecordBatch {
346+
if (schema) {
347+
const children: Data[] = [];
348+
for (const field of schema.fields) {
349+
const col = input[field.name];
350+
if (col === undefined) {
351+
throw new TypeError(
352+
`Schema field "${field.name}" not found in input. ` +
353+
`Available keys: [${Object.keys(input).join(', ')}]`
354+
);
355+
}
356+
children.push(vectorFromArray(col as any, field.type).data[0]);
357+
}
358+
return new RecordBatch(schema, makeData({ type: new Struct(schema.fields), children }));
359+
}
360+
const dataMap: Record<string, Data> = {};
361+
for (const [key, col] of Object.entries(input)) {
362+
dataMap[key] = vectorFromArray(col).data[0];
363+
}
364+
return new RecordBatch(dataMap as any);
365+
}
309366

310367
/** @ignore */
311368
function ensureSameLengthData<T extends TypeMap = any>(

src/table.ts

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -454,15 +454,48 @@ export function makeTable<I extends Record<string | number | symbol, TypedArray>
454454
* })
455455
* ```
456456
*
457-
* @param input Input an object of typed arrays or JavaScript arrays.
457+
* @example
458+
* ```ts
459+
* const schema = new Schema([
460+
* new Field('a', new Int32),
461+
* new Field('b', new Utf8),
462+
* ]);
463+
* const table = tableFromArrays({ a: [1, 2, 3], b: ['x', 'y', 'z'] }, schema);
464+
* ```
465+
*
466+
* @param input An object mapping column names to typed arrays or JavaScript arrays.
467+
* @param schema Optional schema to control column types, ordering, nullability, and metadata.
458468
* @returns A new Table.
459469
*/
460-
export function tableFromArrays<I extends Record<string | number | symbol, TypedArray | BigIntArray | readonly unknown[]>>(input: I): Table<{ [P in keyof I]: ArrayDataType<I[P]> }> {
461-
type T = { [P in keyof I]: ArrayDataType<I[P]> };
462-
const vecs = {} as VectorsMap<T>;
463-
const inputs = Object.entries(input) as [keyof I, I[keyof I]][];
464-
for (const [key, col] of inputs) {
470+
export function tableFromArrays<T extends TypeMap>(
471+
input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
472+
schema: Schema<T>
473+
): Table<T>;
474+
export function tableFromArrays<I extends Record<string | number | symbol, TypedArray | BigIntArray | readonly unknown[]>>(
475+
input: I
476+
): Table<{ [P in keyof I]: ArrayDataType<I[P]> }>;
477+
export function tableFromArrays(
478+
input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
479+
schema?: Schema
480+
): Table {
481+
if (schema) {
482+
const vecs: Vector[] = [];
483+
for (const field of schema.fields) {
484+
const col = input[field.name];
485+
if (col === undefined) {
486+
throw new TypeError(
487+
`Schema field "${field.name}" not found in input. ` +
488+
`Available keys: [${Object.keys(input).join(', ')}]`
489+
);
490+
}
491+
vecs.push(vectorFromArray(col as any, field.type));
492+
}
493+
const [adjustedSchema, batches] = distributeVectorsIntoRecordBatches(schema, vecs);
494+
return new Table(adjustedSchema, batches);
495+
}
496+
const vecs = {} as Record<string, Vector>;
497+
for (const [key, col] of Object.entries(input)) {
465498
vecs[key] = vectorFromArray(col);
466499
}
467-
return new Table<T>(vecs);
500+
return new Table(vecs);
468501
}

test/unit/recordbatch/record-batch-tests.ts

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import '../../jest-extensions.js';
1919
import { arange } from '../utils.js';
2020

21-
import { RecordBatch, makeVector } from 'apache-arrow';
21+
import { RecordBatch, makeVector, recordBatchFromArrays, Schema, Field, Int32, Float32, Float64, Utf8, Dictionary } from 'apache-arrow';
2222

2323
function numsRecordBatch(i32Len: number, f32Len: number) {
2424
return new RecordBatch({
@@ -130,3 +130,98 @@ describe(`RecordBatch`, () => {
130130
});
131131
});
132132
});
133+
134+
describe(`recordBatchFromArrays()`, () => {
135+
test(`creates a RecordBatch from typed arrays and JavaScript arrays`, () => {
136+
const batch = recordBatchFromArrays({
137+
a: new Float32Array([1, 2, 3]),
138+
b: [4, 5, 6],
139+
c: ['x', 'y', 'z'],
140+
});
141+
142+
expect(batch.numRows).toBe(3);
143+
expect(batch.numCols).toBe(3);
144+
expect(batch.getChild('a')!.type).toBeInstanceOf(Float32);
145+
expect(batch.getChild('b')!.type).toBeInstanceOf(Float64);
146+
expect(batch.getChild('c')!.type).toBeInstanceOf(Dictionary);
147+
});
148+
149+
test(`schema overrides type inference`, () => {
150+
const schema = new Schema([
151+
new Field('a', new Int32),
152+
new Field('b', new Utf8),
153+
]);
154+
const batch = recordBatchFromArrays({ a: [1, 2, 3], b: ['x', 'y', 'z'] }, schema);
155+
156+
expect(batch.numRows).toBe(3);
157+
expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
158+
expect(batch.getChild('b')!.type).toBeInstanceOf(Utf8);
159+
expect(batch.getChild('a')!.toArray()).toEqual(new Int32Array([1, 2, 3]));
160+
});
161+
162+
test(`schema coerces TypedArray type`, () => {
163+
const schema = new Schema([new Field('a', new Int32)]);
164+
const batch = recordBatchFromArrays({ a: new Float32Array([1, 2, 3]) }, schema);
165+
expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
166+
expect(batch.getChild('a')!.toArray()).toEqual(new Int32Array([1, 2, 3]));
167+
});
168+
169+
test(`preserves schema metadata`, () => {
170+
const schema = new Schema(
171+
[new Field('a', new Int32)],
172+
new Map([['source', 'test']])
173+
);
174+
const batch = recordBatchFromArrays({ a: [1, 2, 3] }, schema);
175+
expect(batch.schema.metadata.get('source')).toBe('test');
176+
});
177+
178+
test(`throws on missing schema field`, () => {
179+
const schema = new Schema([new Field('c', new Int32)]);
180+
expect(() => recordBatchFromArrays({ a: [1] }, schema)).toThrow(TypeError);
181+
expect(() => recordBatchFromArrays({ a: [1] }, schema)).toThrow(/Schema field "c" not found in input/);
182+
});
183+
184+
test(`handles different length columns via ensureSameLengthData`, () => {
185+
const schema = new Schema([
186+
new Field('a', new Int32),
187+
new Field('b', new Int32),
188+
]);
189+
const batch = recordBatchFromArrays({ a: [1, 2, 3], b: [4, 5] }, schema);
190+
expect(batch.numRows).toBe(3);
191+
expect(batch.getChild('a')!).toHaveLength(3);
192+
expect(batch.getChild('b')!).toHaveLength(3);
193+
expect(batch.getChild('b')!.nullCount).toBe(1);
194+
});
195+
196+
test(`preserves field ordering from schema`, () => {
197+
const schema = new Schema([
198+
new Field('b', new Float64),
199+
new Field('a', new Int32),
200+
]);
201+
const batch = recordBatchFromArrays({ a: [1, 2, 3], b: [4.0, 5.0, 6.0] }, schema);
202+
expect(batch.schema.fields[0].name).toBe('b');
203+
expect(batch.schema.fields[1].name).toBe('a');
204+
expect(batch.getChild('b')!.type).toBeInstanceOf(Float64);
205+
expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
206+
});
207+
208+
test(`handles empty arrays`, () => {
209+
const schema = new Schema([new Field('a', new Int32)]);
210+
const batch = recordBatchFromArrays({ a: new Int32Array(0) }, schema);
211+
expect(batch.numRows).toBe(0);
212+
expect(batch.numCols).toBe(1);
213+
expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
214+
});
215+
216+
test(`basic creation without schema infers types`, () => {
217+
const batch = recordBatchFromArrays({
218+
f32: new Float32Array([1, 2]),
219+
nums: [1, 2, 3],
220+
strs: ['a', 'b'],
221+
});
222+
223+
expect(batch.getChild('f32')!.type).toBeInstanceOf(Float32);
224+
expect(batch.getChild('nums')!.type).toBeInstanceOf(Float64);
225+
expect(batch.getChild('strs')!.type).toBeInstanceOf(Dictionary);
226+
});
227+
});

0 commit comments

Comments
 (0)