Skip to content

Commit 3eaceec

Browse files
trxcllntwesm
authored andcommitted
ARROW-6317: [JS] Implement IPC message format alignment changes
Implements the IPC message format alignment changes for [ARROW-6313](https://issues.apache.org/jira/browse/ARROW-6313). In this PR the `MessageReader` can read messages with the old alignment, but `RecordBatchWriter` always produces messages with the new alignment. I can add a flag if others think it'd be useful to produce messages in the old format. Closes apache#5225 from trxcllnt/ARROW-6314 and squashes the following commits: b69bc0e <ptaylor> update ipc reader and writer for ARROW-6313 Authored-by: ptaylor <paul.e.taylor@me.com> Signed-off-by: Wes McKinney <wesm+git@apache.org>
1 parent 7f4d50d commit 3eaceec

3 files changed

Lines changed: 69 additions & 25 deletions

File tree

js/src/ipc/message.ts

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ export class MessageReader implements IterableIterator<Message> {
4040
public next(): IteratorResult<Message> {
4141
let r;
4242
if ((r = this.readMetadataLength()).done) { return ITERATOR_DONE; }
43+
// ARROW-6313: If the first 4 bytes are continuation indicator (-1), read
44+
// the next 4 for the 32-bit metadata length. Otherwise, assume this is a
45+
// pre-v0.15 message, where the first 4 bytes are the metadata length.
46+
if ((r.value === -1) &&
47+
(r = this.readMetadataLength()).done) { return ITERATOR_DONE; }
4348
if ((r = this.readMetadata(r.value)).done) { return ITERATOR_DONE; }
4449
return (<any> r) as IteratorResult<Message>;
4550
}
@@ -76,8 +81,8 @@ export class MessageReader implements IterableIterator<Message> {
7681
protected readMetadataLength(): IteratorResult<number> {
7782
const buf = this.source.read(PADDING);
7883
const bb = buf && new ByteBuffer(buf);
79-
const len = +(bb && bb.readInt32(0))!;
80-
return { done: len <= 0, value: len };
84+
const len = bb && bb.readInt32(0) || 0;
85+
return { done: len === 0, value: len };
8186
}
8287
protected readMetadata(metadataLength: number): IteratorResult<Message> {
8388
const buf = this.source.read(metadataLength);
@@ -104,6 +109,11 @@ export class AsyncMessageReader implements AsyncIterableIterator<Message> {
104109
public async next(): Promise<IteratorResult<Message>> {
105110
let r;
106111
if ((r = await this.readMetadataLength()).done) { return ITERATOR_DONE; }
112+
// ARROW-6313: If the first 4 bytes are continuation indicator (-1), read
113+
// the next 4 for the 32-bit metadata length. Otherwise, assume this is a
114+
// pre-v0.15 message, where the first 4 bytes are the metadata length.
115+
if ((r.value === -1) &&
116+
(r = await this.readMetadataLength()).done) { return ITERATOR_DONE; }
107117
if ((r = await this.readMetadata(r.value)).done) { return ITERATOR_DONE; }
108118
return (<any> r) as IteratorResult<Message>;
109119
}
@@ -140,8 +150,8 @@ export class AsyncMessageReader implements AsyncIterableIterator<Message> {
140150
protected async readMetadataLength(): Promise<IteratorResult<number>> {
141151
const buf = await this.source.read(PADDING);
142152
const bb = buf && new ByteBuffer(buf);
143-
const len = +(bb && bb.readInt32(0))!;
144-
return { done: len <= 0, value: len };
153+
const len = bb && bb.readInt32(0) || 0;
154+
return { done: len === 0, value: len };
145155
}
146156
protected async readMetadata(metadataLength: number): Promise<IteratorResult<Message>> {
147157
const buf = await this.source.read(metadataLength);

js/src/ipc/writer.ts

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,21 @@ import { JSONVectorAssembler } from '../visitor/jsonvectorassembler';
3232
import { ArrayBufferViewInput, toUint8Array } from '../util/buffer';
3333
import { RecordBatch, _InternalEmptyPlaceholderRecordBatch } from '../recordbatch';
3434
import { Writable, ReadableInterop, ReadableDOMStreamOptions } from '../io/interfaces';
35-
import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream, isIterable } from '../util/compat';
35+
import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream, isIterable, isObject } from '../util/compat';
36+
37+
export interface RecordBatchStreamWriterOptions {
38+
/**
39+
*
40+
*/
41+
autoDestroy?: boolean;
42+
/**
43+
* A flag indicating whether the RecordBatchWriter should construct pre-0.15.0
44+
* encapsulated IPC Messages, which reserves 4 bytes for the Message metadata
45+
* length instead of 8.
46+
* @see https://issues.apache.org/jira/browse/ARROW-6313
47+
*/
48+
writeLegacyIpcFormat?: boolean;
49+
}
3650

3751
export class RecordBatchWriter<T extends { [key: string]: DataType } = any> extends ReadableInterop<Uint8Array> implements Writable<RecordBatch<T>> {
3852

@@ -51,14 +65,17 @@ export class RecordBatchWriter<T extends { [key: string]: DataType } = any> exte
5165
throw new Error(`"throughDOM" not available in this environment`);
5266
}
5367

54-
constructor(options?: { autoDestroy: boolean }) {
68+
constructor(options?: RecordBatchStreamWriterOptions) {
5569
super();
56-
this._autoDestroy = options && (typeof options.autoDestroy === 'boolean') ? options.autoDestroy : true;
70+
isObject(options) || (options = { autoDestroy: true, writeLegacyIpcFormat: false });
71+
this._autoDestroy = (typeof options.autoDestroy === 'boolean') ? options.autoDestroy : true;
72+
this._writeLegacyIpcFormat = (typeof options.writeLegacyIpcFormat === 'boolean') ? options.writeLegacyIpcFormat : false;
5773
}
5874

5975
protected _position = 0;
6076
protected _started = false;
6177
protected _autoDestroy: boolean;
78+
protected _writeLegacyIpcFormat: boolean;
6279
// @ts-ignore
6380
protected _sink = new AsyncByteQueue();
6481
protected _schema: Schema | null = null;
@@ -178,17 +195,22 @@ export class RecordBatchWriter<T extends { [key: string]: DataType } = any> exte
178195
const a = alignment - 1;
179196
const buffer = Message.encode(message);
180197
const flatbufferSize = buffer.byteLength;
181-
const alignedSize = (flatbufferSize + 4 + a) & ~a;
182-
const nPaddingBytes = alignedSize - flatbufferSize - 4;
198+
const prefixSize = !this._writeLegacyIpcFormat ? 8 : 4;
199+
const alignedSize = (flatbufferSize + prefixSize + a) & ~a;
200+
const nPaddingBytes = alignedSize - flatbufferSize - prefixSize;
183201

184202
if (message.headerType === MessageHeader.RecordBatch) {
185203
this._recordBatchBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position));
186204
} else if (message.headerType === MessageHeader.DictionaryBatch) {
187205
this._dictionaryBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position));
188206
}
189207

208+
// If not in legacy pre-0.15.0 mode, write the stream continuation indicator
209+
if (!this._writeLegacyIpcFormat) {
210+
this._write(Int32Array.of(-1));
211+
}
190212
// Write the flatbuffer size prefix including padding
191-
this._write(Int32Array.of(alignedSize - 4));
213+
this._write(Int32Array.of(alignedSize - prefixSize));
192214
// Write the flatbuffer
193215
if (flatbufferSize > 0) { this._write(buffer); }
194216
// Write any padding
@@ -212,7 +234,10 @@ export class RecordBatchWriter<T extends { [key: string]: DataType } = any> exte
212234

213235
// @ts-ignore
214236
protected _writeFooter(schema: Schema<T>) {
215-
return this._writePadding(4); // eos bytes
237+
// eos bytes
238+
return this._writeLegacyIpcFormat
239+
? this._write(Int32Array.of(0))
240+
: this._write(Int32Array.of(-1, 0));
216241
}
217242

218243
protected _writeMagic() {
@@ -275,12 +300,12 @@ export class RecordBatchWriter<T extends { [key: string]: DataType } = any> exte
275300

276301
/** @ignore */
277302
export class RecordBatchStreamWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> {
278-
public static writeAll<T extends { [key: string]: DataType } = any>(input: Table<T> | Iterable<RecordBatch<T>>, options?: { autoDestroy: true }): RecordBatchStreamWriter<T>;
279-
public static writeAll<T extends { [key: string]: DataType } = any>(input: AsyncIterable<RecordBatch<T>>, options?: { autoDestroy: true }): Promise<RecordBatchStreamWriter<T>>;
280-
public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<AsyncIterable<RecordBatch<T>>>, options?: { autoDestroy: true }): Promise<RecordBatchStreamWriter<T>>;
281-
public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>, options?: { autoDestroy: true }): Promise<RecordBatchStreamWriter<T>>;
303+
public static writeAll<T extends { [key: string]: DataType } = any>(input: Table<T> | Iterable<RecordBatch<T>>, options?: RecordBatchStreamWriterOptions): RecordBatchStreamWriter<T>;
304+
public static writeAll<T extends { [key: string]: DataType } = any>(input: AsyncIterable<RecordBatch<T>>, options?: RecordBatchStreamWriterOptions): Promise<RecordBatchStreamWriter<T>>;
305+
public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<AsyncIterable<RecordBatch<T>>>, options?: RecordBatchStreamWriterOptions): Promise<RecordBatchStreamWriter<T>>;
306+
public static writeAll<T extends { [key: string]: DataType } = any>(input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>, options?: RecordBatchStreamWriterOptions): Promise<RecordBatchStreamWriter<T>>;
282307
/** @nocollapse */
283-
public static writeAll<T extends { [key: string]: DataType } = any>(input: any, options?: { autoDestroy: true }) {
308+
public static writeAll<T extends { [key: string]: DataType } = any>(input: any, options?: RecordBatchStreamWriterOptions) {
284309
const writer = new RecordBatchStreamWriter<T>(options);
285310
if (isPromise<any>(input)) {
286311
return input.then((x) => writer.writeAll(x));
@@ -323,8 +348,8 @@ export class RecordBatchFileWriter<T extends { [key: string]: DataType } = any>
323348
schema, MetadataVersion.V4,
324349
this._recordBatchBlocks, this._dictionaryBlocks
325350
));
326-
return this
327-
._writePadding(4) // EOS bytes for sequential readers
351+
return super
352+
._writeFooter(schema) // EOS bytes for sequential readers
328353
._write(buffer) // Write the flatbuffer
329354
._write(Int32Array.of(buffer.byteLength)) // then the footer size suffix
330355
._writeMagic(); // then the magic suffix
@@ -355,6 +380,8 @@ export class RecordBatchJSONWriter<T extends { [key: string]: DataType } = any>
355380
}
356381

357382
protected _writeMessage() { return this; }
383+
// @ts-ignore
384+
protected _writeFooter(schema: Schema<T>) { return this; }
358385
protected _writeSchema(schema: Schema<T>) {
359386
return this._write(`{\n "schema": ${
360387
JSON.stringify({ fields: schema.fields.map(fieldToJSON) }, null, 2)

js/test/unit/ipc/writer/stream-writer-tests.ts

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222

2323
import * as generate from '../../../generate-test-data';
2424
import { validateRecordBatchIterator } from '../validate';
25+
import { RecordBatchStreamWriterOptions } from '../../../../src/ipc/writer';
2526
import { DictionaryVector, Dictionary, Uint32, Int32 } from '../../../Arrow';
2627
import { Table, Schema, Chunked, Builder, RecordBatch, RecordBatchReader, RecordBatchStreamWriter } from '../../../Arrow';
2728

@@ -31,15 +32,21 @@ describe('RecordBatchStreamWriter', () => {
3132
const type = generate.sparseUnion(0, 0).vector.type;
3233
const schema = Schema.new({ 'dictSparseUnion': type });
3334
const table = generate.table([10, 20, 30], schema).table;
34-
testStreamWriter(table, `[${table.schema.fields.join(', ')}]`);
35+
const testName = `[${table.schema.fields.join(', ')}]`;
36+
testStreamWriter(table, testName, { writeLegacyIpcFormat: true });
37+
testStreamWriter(table, testName, { writeLegacyIpcFormat: false });
3538
})();
3639

3740
for (const table of generateRandomTables([10, 20, 30])) {
38-
testStreamWriter(table, `[${table.schema.fields.join(', ')}]`);
41+
const testName = `[${table.schema.fields.join(', ')}]`;
42+
testStreamWriter(table, testName, { writeLegacyIpcFormat: true });
43+
testStreamWriter(table, testName, { writeLegacyIpcFormat: false });
3944
}
4045

4146
for (const table of generateDictionaryTables([10, 20, 30])) {
42-
testStreamWriter(table, `${table.schema.fields[0]}`);
47+
const testName = `${table.schema.fields[0]}`;
48+
testStreamWriter(table, testName, { writeLegacyIpcFormat: true });
49+
testStreamWriter(table, testName, { writeLegacyIpcFormat: false });
4350
}
4451

4552
it(`should write multiple tables to the same output stream`, async () => {
@@ -98,14 +105,14 @@ describe('RecordBatchStreamWriter', () => {
98105
});
99106
});
100107

101-
function testStreamWriter(table: Table, name: string) {
108+
function testStreamWriter(table: Table, name: string, options: RecordBatchStreamWriterOptions) {
102109
describe(`should write the Arrow IPC stream format (${name})`, () => {
103-
test(`Table`, validateTable.bind(0, table));
110+
test(`Table`, validateTable.bind(0, table, options));
104111
});
105112
}
106113

107-
async function validateTable(source: Table) {
108-
const writer = RecordBatchStreamWriter.writeAll(source);
114+
async function validateTable(source: Table, options: RecordBatchStreamWriterOptions) {
115+
const writer = RecordBatchStreamWriter.writeAll(source, options);
109116
const result = await Table.from(writer.toUint8Array());
110117
validateRecordBatchIterator(3, source.chunks);
111118
expect(result).toEqualTable(source);

0 commit comments

Comments
 (0)