Skip to content

Commit f5df773

Browse files
trxcllntTheNeuralBit
authored andcommitted
ARROW-4682: [JS] Fix writing empty tables
Closes https://issues.apache.org/jira/browse/ARROW-4682 Author: ptaylor <paul.e.taylor@me.com> Closes apache#3759 from trxcllnt/js/fix-write-empty-table and squashes the following commits: cae6622 <ptaylor> fix lint bfc9015 <ptaylor> ensure schema metadata is preserved through assign 4898fbd <ptaylor> assign should compare column names, not fields 946a271 <ptaylor> fix assigning to empty tables 82192ae <ptaylor> add options to print horizontal table separator, schema metadata ea1c8db <ptaylor> fix writing empty tables
1 parent 0ff776f commit f5df773

9 files changed

Lines changed: 217 additions & 61 deletions

File tree

js/src/bin/arrow2csv.ts

Lines changed: 111 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,24 @@ import * as fs from 'fs';
2323
import * as stream from 'stream';
2424
import { valueToString } from '../util/pretty';
2525
import { RecordBatch, RecordBatchReader, AsyncByteQueue } from '../Arrow.node';
26+
import { Schema } from '../schema';
2627

2728
const padLeft = require('pad-left');
2829
const bignumJSONParse = require('json-bignum').parse;
2930
const pipeline = require('util').promisify(stream.pipeline);
3031
const argv = require(`command-line-args`)(cliOpts(), { partial: true });
3132
const files = argv.help ? [] : [...(argv.file || []), ...(argv._unknown || [])].filter(Boolean);
3233

33-
const state = { ...argv, closed: false, hasRecords: false };
34+
const state = { ...argv, closed: false, maxColWidths: [10] };
35+
36+
type ToStringState = {
37+
hr: string;
38+
sep: string;
39+
schema: any;
40+
closed: boolean;
41+
metadata: boolean;
42+
maxColWidths: number[];
43+
};
3444

3545
(async () => {
3646

@@ -40,20 +50,22 @@ const state = { ...argv, closed: false, hasRecords: false };
4050
].filter(Boolean) as (() => NodeJS.ReadableStream)[];
4151

4252
let reader: RecordBatchReader | null;
53+
let hasReaders = false;
4354

4455
for (const source of sources) {
4556
if (state.closed) { break; }
4657
for await (reader of recordBatchReaders(source)) {
58+
hasReaders = true;
4759
const source = reader.toNodeStream();
48-
const xform = batchesToString(state);
60+
const xform = batchesToString(state, reader.schema);
4961
const sink = new stream.PassThrough();
5062
sink.pipe(process.stdout, { end: false });
5163
await pipeline(source, xform, sink).catch(() => state.closed = true);
5264
}
5365
if (state.closed) { break; }
5466
}
5567

56-
return state.hasRecords ? 0 : print_usage();
68+
return hasReaders ? 0 : print_usage();
5769
})()
5870
.then((x) => +x || 0, (err) => {
5971
if (err) {
@@ -93,44 +105,96 @@ async function *recordBatchReaders(createSourceStream: () => NodeJS.ReadableStre
93105
}
94106
}
95107

96-
function batchesToString(state: { closed: boolean, schema: any, separator: string, hasRecords: boolean }) {
108+
function batchesToString(state: ToStringState, schema: Schema) {
109+
110+
let rowId = 0;
111+
let batchId = -1;
112+
let maxColWidths = [10];
113+
const { hr, sep } = state;
97114

98-
let rowId = 0, maxColWidths = [15], separator = `${state.separator || ' |'} `;
115+
const header = ['row_id', ...schema.fields.map((f) => `${f}`)].map(valueToString);
99116

100-
return new stream.Transform({ transform, encoding: 'utf8', writableObjectMode: true, readableObjectMode: false });
117+
state.maxColWidths = header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length));
118+
119+
return new stream.Transform({
120+
transform,
121+
encoding: 'utf8',
122+
writableObjectMode: true,
123+
readableObjectMode: false,
124+
final(this: stream.Transform, cb: (error?: Error | null) => void) {
125+
// if there were no batches, then print the Schema, and metadata
126+
if (batchId === -1) {
127+
this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n\n`);
128+
this.push(`${formatRow(header, maxColWidths, sep)}\n`);
129+
if (state.metadata && schema.metadata.size > 0) {
130+
this.push(`metadata:\n${formatMetadata(schema.metadata)}\n`);
131+
}
132+
}
133+
this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n\n`);
134+
cb();
135+
}
136+
});
101137

102138
function transform(this: stream.Transform, batch: RecordBatch, _enc: string, cb: (error?: Error, data?: any) => void) {
139+
103140
batch = !(state.schema && state.schema.length) ? batch : batch.select(...state.schema);
104-
if (batch.length <= 0 || batch.numCols <= 0 || state.closed) {
105-
state.hasRecords || (state.hasRecords = false);
106-
return cb(undefined, null);
107-
}
108141

109-
state.hasRecords = true;
110-
const header = ['row_id', ...batch.schema.fields.map((f) => `${f}`)].map(valueToString);
142+
if (state.closed) { return cb(undefined, null); }
111143

112144
// Pass one to convert to strings and count max column widths
113-
const newMaxWidths = measureColumnWidths(rowId, batch, header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length)));
145+
state.maxColWidths = measureColumnWidths(rowId, batch, header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length)));
114146

115-
// If any of the column widths changed, print the header again
116-
if ((rowId % 350) && JSON.stringify(newMaxWidths) !== JSON.stringify(maxColWidths)) {
117-
this.push(`\n${formatRow(header, newMaxWidths, separator)}`);
147+
// If this is the first batch in a stream, print a top horizontal rule, schema metadata, and
148+
if (++batchId === 0) {
149+
this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n`);
150+
if (state.metadata && batch.schema.metadata.size > 0) {
151+
this.push(`metadata:\n${formatMetadata(batch.schema.metadata)}\n`);
152+
this.push(`${horizontalRule(state.maxColWidths, hr, sep)}\n`);
153+
}
154+
if (batch.length <= 0 || batch.numCols <= 0) {
155+
this.push(`${formatRow(header, maxColWidths = state.maxColWidths, sep)}\n`);
156+
}
118157
}
119158

120-
maxColWidths = newMaxWidths;
121-
122-
for (const row of batch) {
123-
if (state.closed) { break; }
124-
else if (!row) { continue; }
125-
if (!(rowId % 350)) { this.push(`\n${formatRow(header, maxColWidths, separator)}`); }
126-
this.push(formatRow([rowId++, ...row].map(valueToString), maxColWidths, separator));
159+
if (batch.length > 0 && batch.numCols > 0) {
160+
// If any of the column widths changed, print the header again
161+
if (rowId % 350 !== 0 && JSON.stringify(state.maxColWidths) !== JSON.stringify(maxColWidths)) {
162+
this.push(`${formatRow(header, state.maxColWidths, sep)}\n`);
163+
}
164+
maxColWidths = state.maxColWidths;
165+
for (const row of batch) {
166+
if (state.closed) { break; } else if (!row) { continue; }
167+
if (rowId++ % 350 === 0) {
168+
this.push(`${formatRow(header, maxColWidths, sep)}\n`);
169+
}
170+
this.push(`${formatRow([rowId, ...row].map(valueToString), maxColWidths, sep)}\n`);
171+
}
127172
}
128173
cb();
129174
}
130175
}
131176

132-
function formatRow(row: string[] = [], maxColWidths: number[] = [], separator: string = ' |') {
133-
return row.map((x, j) => padLeft(x, maxColWidths[j])).join(separator) + '\n';
177+
function horizontalRule(maxColWidths: number[], hr = '-', sep = ' |') {
178+
return ` ${padLeft('', maxColWidths.reduce((x, y) => x + y, -2 + maxColWidths.length * sep.length), hr)}`;
179+
}
180+
181+
function formatRow(row: string[] = [], maxColWidths: number[] = [], sep = ' |') {
182+
return `${row.map((x, j) => padLeft(x, maxColWidths[j])).join(sep)}`;
183+
}
184+
185+
function formatMetadata(metadata: Map<string, string>) {
186+
187+
return [...metadata].map(([key, val]) =>
188+
` ${key}: ${formatMetadataValue(val)}`
189+
).join(', \n');
190+
191+
function formatMetadataValue(value: string = '') {
192+
let parsed = value;
193+
try {
194+
parsed = JSON.stringify(JSON.parse(value), null, 2);
195+
} catch (e) { parsed = value; }
196+
return valueToString(parsed).split('\n').join('\n ');
197+
}
134198
}
135199
136200
function measureColumnWidths(rowId: number, batch: RecordBatch, maxColWidths: number[] = []) {
@@ -201,8 +265,19 @@ function cliOpts() {
201265
},
202266
{
203267
type: String,
204-
name: 'sep', optional: true, default: '|',
205-
description: 'The column separator character'
268+
name: 'sep', optional: true, default: ' |',
269+
description: 'The column separator character (default: " |")'
270+
},
271+
{
272+
type: String,
273+
name: 'hr', optional: true, default: '-',
274+
description: 'The horizontal border character (default: "-")'
275+
},
276+
{
277+
type: Boolean,
278+
name: 'metadata', alias: 'm',
279+
optional: true, default: false,
280+
description: 'Flag to print Schema metadata (default: false)'
206281
},
207282
{
208283
type: Boolean,
@@ -234,14 +309,15 @@ function print_usage() {
234309
{
235310
header: 'Example',
236311
content: [
237-
'$ arrow2csv --schema foo baz -f simple.arrow --sep ","',
238-
' ',
239-
'> "row_id", "foo: Int32", "bar: Float64", "baz: Utf8"',
240-
'> 0, 1, 1, "aa"',
241-
'> 1, null, null, null',
242-
'> 2, 3, null, null',
243-
'> 3, 4, 4, "bbb"',
244-
'> 4, 5, 5, "cccc"',
312+
'$ arrow2csv --schema foo baz --sep "," -f simple.arrow',
313+
'>--------------------------------------',
314+
'> "row_id", "foo: Int32", "baz: Utf8"',
315+
'> 0, 1, "aa"',
316+
'> 1, null, null',
317+
'> 2, 3, null',
318+
'> 3, 4, "bbb"',
319+
'> 4, 5, "cccc"',
320+
'>--------------------------------------',
245321
]
246322
}
247323
]));

js/src/ipc/writer.ts

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import { JSONTypeAssembler } from '../visitor/jsontypeassembler';
3333
import { JSONVectorAssembler } from '../visitor/jsonvectorassembler';
3434
import { ArrayBufferViewInput, toUint8Array } from '../util/buffer';
3535
import { Writable, ReadableInterop, ReadableDOMStreamOptions } from '../io/interfaces';
36-
import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream } from '../util/compat';
36+
import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream, isIterable } from '../util/compat';
3737

3838
export class RecordBatchWriter<T extends { [key: string]: DataType } = any> extends ReadableInterop<Uint8Array> implements Writable<RecordBatch<T>> {
3939

@@ -140,21 +140,34 @@ export class RecordBatchWriter<T extends { [key: string]: DataType } = any> exte
140140
return this;
141141
}
142142

143-
public write(chunk?: Table<T> | RecordBatch<T> | null) {
144-
let schema: Schema<T> | null;
143+
public write(payload?: Table<T> | RecordBatch<T> | Iterable<RecordBatch<T>> | null) {
144+
145+
let schema: Schema<T> | null = null;
146+
145147
if (!this._sink) {
146148
throw new Error(`RecordBatchWriter is closed`);
147-
} else if (!chunk || !(schema = chunk.schema)) {
149+
} else if (payload === null || payload === undefined) {
150+
return this.finish() && undefined;
151+
} else if (payload instanceof Table && !(schema = payload.schema)) {
148152
return this.finish() && undefined;
149-
} else if (schema !== this._schema) {
153+
} else if (payload instanceof RecordBatch && !(schema = payload.schema)) {
154+
return this.finish() && undefined;
155+
}
156+
157+
if (schema && !schema.compareTo(this._schema)) {
150158
if (this._started && this._autoDestroy) {
151159
return this.close();
152160
}
153161
this.reset(this._sink, schema);
154162
}
155-
(chunk instanceof Table)
156-
? this.writeAll(chunk.chunks)
157-
: this._writeRecordBatch(chunk);
163+
164+
if (payload instanceof RecordBatch) {
165+
this._writeRecordBatch(payload);
166+
} else if (payload instanceof Table) {
167+
this.writeAll(payload.chunks);
168+
} else if (isIterable(payload)) {
169+
this.writeAll(payload);
170+
}
158171
}
159172

160173
protected _writeMessage<T extends MessageHeader>(message: Message<T>, alignment = 8) {
@@ -363,7 +376,11 @@ export class RecordBatchJSONWriter<T extends { [key: string]: DataType } = any>
363376

364377
/** @ignore */
365378
function writeAll<T extends { [key: string]: DataType } = any>(writer: RecordBatchWriter<T>, input: Table<T> | Iterable<RecordBatch<T>>) {
366-
const chunks = (input instanceof Table) ? input.chunks : input;
379+
let chunks = input as Iterable<RecordBatch<T>>;
380+
if (input instanceof Table) {
381+
chunks = input.chunks;
382+
writer.reset(undefined, input.schema);
383+
}
367384
for (const batch of chunks) {
368385
writer.write(batch);
369386
}

js/src/schema.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ export class Schema<T extends { [key: string]: DataType } = any> {
9292
const curDictionaryFields = this.dictionaryFields;
9393
const metadata = mergeMaps(mergeMaps(new Map(), this.metadata), other.metadata);
9494
const newFields = other.fields.filter((f2) => {
95-
const i = curFields.findIndex((f) => f.compareTo(f2));
96-
return ~i ? (curFields[i] = curFields[i].clone({
95+
const i = curFields.findIndex((f) => f.name === f2.name);
96+
return ~i ? (curFields[i] = f2.clone({
9797
metadata: mergeMaps(mergeMaps(new Map(), curFields[i].metadata), f2.metadata)
9898
})) && false : true;
9999
}) as Field[];
@@ -102,7 +102,7 @@ export class Schema<T extends { [key: string]: DataType } = any> {
102102
const newDictionaries = [...dictionaries].filter(([y]) => !curDictionaries.every(([x]) => x === y));
103103
const newDictionaryFields = [...dictionaryFields].map(([id, newDictFields]) => {
104104
return [id, [...(curDictionaryFields.get(id) || []), ...newDictFields.map((f) => {
105-
const i = newFields.findIndex((f2) => f2.compareTo(f));
105+
const i = newFields.findIndex((f2) => f.name === f2.name);
106106
const { dictionary, indices, isOrdered, dictionaryVector } = f.type;
107107
const type = new Dictionary(dictionary, indices, id, isOrdered, dictionaryVector);
108108
return newFields[i] = f.clone({ type });

js/src/table.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,9 +179,11 @@ export class Table<T extends { [key: string]: DataType } = any>
179179
throw new TypeError('Table must be initialized with a Schema or at least one RecordBatch');
180180
}
181181

182-
if (!chunks[0]) { chunks[0] = new RecordBatch(schema, 0, []); }
182+
if (!chunks[0]) {
183+
chunks[0] = new RecordBatch(schema, 0, schema.fields.map((f) => new Data(f.type, 0, 0)));
184+
}
183185

184-
super(chunks[0].type, chunks);
186+
super(new Struct<T>(schema.fields), chunks);
185187

186188
this._schema = schema;
187189
this._chunks = chunks;
@@ -252,7 +254,7 @@ export class Table<T extends { [key: string]: DataType } = any>
252254
const fields = this._schema.fields;
253255
const [indices, oldToNew] = other.schema.fields.reduce((memo, f2, newIdx) => {
254256
const [indices, oldToNew] = memo;
255-
const i = fields.findIndex((f) => f.compareTo(f2));
257+
const i = fields.findIndex((f) => f.name === f2.name);
256258
~i ? (oldToNew[i] = newIdx) : indices.push(newIdx);
257259
return memo;
258260
}, [[], []] as number[][]);

js/src/util/recordbatch.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,14 @@ export function distributeVectorsIntoRecordBatches<T extends { [key: string]: Da
6565
/** @ignore */
6666
function uniformlyDistributeChunksAcrossRecordBatches<T extends { [key: string]: DataType } = any>(schema: Schema<T>, columns: Data<T[keyof T]>[][]): [Schema<T>, RecordBatch<T>[]] {
6767

68-
let numBatches = 0;
6968
const fields = [...schema.fields];
70-
const batches = [] as [number, Data<T[keyof T]>[]][];
69+
const batchArgs = [] as [number, Data<T[keyof T]>[]][];
7170
const memo = { numBatches: columns.reduce((n, c) => Math.max(n, c.length), 0) };
71+
let sameLength = false, numBatches = 0, batchLength = 0, batchData: Data<T[keyof T]>[];
7272

73-
while (memo.numBatches > 0) {
73+
while (memo.numBatches-- > 0) {
7474

75-
const [sameLength, batchLength] = columns.reduce((memo, [chunk]) => {
75+
[sameLength, batchLength] = columns.reduce((memo, [chunk]) => {
7676
const [same, batchLength] = memo;
7777
const chunkLength = chunk ? chunk.length : batchLength;
7878
isFinite(batchLength) && same && (memo[0] = chunkLength === batchLength);
@@ -81,18 +81,18 @@ function uniformlyDistributeChunksAcrossRecordBatches<T extends { [key: string]:
8181
}, [true, Number.POSITIVE_INFINITY] as [boolean, number]);
8282

8383
if (isFinite(batchLength) && !(sameLength && batchLength <= 0)) {
84-
batches[numBatches++] = [batchLength, distributeChildData(fields, batchLength, columns, memo)];
84+
batchData = distributeChildData(fields, batchLength, columns, memo);
85+
batchLength > 0 && (batchArgs[numBatches++] = [batchLength, batchData]);
8586
}
8687
}
8788
return [
88-
schema = new Schema<T>(fields),
89-
batches.map((xs) => new RecordBatch(schema, ...xs))
89+
schema = new Schema<T>(fields, schema.metadata),
90+
batchArgs.map((xs) => new RecordBatch(schema, ...xs))
9091
];
9192
}
9293

9394
/** @ignore */
9495
function distributeChildData<T extends { [key: string]: DataType } = any>(fields: Field<T[keyof T]>[], batchLength: number, columns: Data<T[keyof T]>[][], memo: { numBatches: number }) {
95-
memo.numBatches -= 1;
9696
let data: Data<T[keyof T]>;
9797
let field: Field<T[keyof T]>;
9898
let chunks: Data<T[keyof T]>[];

js/src/visitor/vectorassembler.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,10 @@ export class VectorAssembler extends Visitor {
5959

6060
/** @nocollapse */
6161
public static assemble<T extends Vector | RecordBatch>(...args: (T | T[])[]) {
62-
return new VectorAssembler().visitMany(selectVectorChildrenArgs(RecordBatch, args))[0];
62+
const assembler = new VectorAssembler();
63+
const vectorChildren = selectVectorChildrenArgs(RecordBatch, args);
64+
const [assembleResult = assembler] = assembler.visitMany(vectorChildren);
65+
return assembleResult;
6366
}
6467

6568
private constructor() { super(); }

js/test/generate-test-data.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ export type GeneratedVector<TVec extends Vector = Vector> = {
152152
values: () => (TVec['TValue'] | null)[];
153153
};
154154

155-
export const table = (lengths = [100], schema: Schema = new Schema(defaultRecordBatchChildren.slice())): GeneratedTable => {
155+
export const table = (lengths = [100], schema: Schema = new Schema(defaultRecordBatchChildren.slice(), new Map([['foo', 'bar']]))): GeneratedTable => {
156156
const generated = lengths.map((length) => recordBatch(length, schema));
157157
const rowBatches = generated.map(({ rows }) => rows);
158158
const colBatches = generated.map(({ cols }) => cols);

js/test/jest-extensions.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ function toEqualTable(this: jest.MatcherUtils, actual: Table, expected: Table) {
5555
const failures = [] as string[];
5656
try { expect(actual.length).toEqual(expected.length); } catch (e) { failures.push(`${e}`); }
5757
try { expect(actual.numCols).toEqual(expected.numCols); } catch (e) { failures.push(`${e}`); }
58+
try { expect(actual.schema.metadata).toEqual(expected.schema.metadata); } catch (e) { failures.push(`${e}`); }
5859
(() => {
5960
for (let i = -1, n = actual.numCols; ++i < n;) {
6061
const v1 = actual.getColumnAt(i);

0 commit comments

Comments
 (0)