Skip to content

Commit b1eecc4

Browse files
committed
ARROW-12703: [JS] Separate Table from DataFrame
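Changes:
* **Breaking:** A `Table` is no longer a `DataFrame`: `scan`, `scanReverse`, `countBy`, and `filter` are no longer available on `Table`. You can construct a `DataFrame` just like a `Table`, or wrap an existing table with `new DataFrame(table)`.

Closes apache#10277 from domoritz/dataframe

Authored-by: Dominik Moritz <domoritz@gmail.com>
Signed-off-by: Dominik Moritz <domoritz@gmail.com>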
1 parent 5c82111 commit b1eecc4

4 files changed

Lines changed: 318 additions & 262 deletions

File tree

js/src/compute/dataframe.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,16 @@ export type BindFunc = (batch: RecordBatch) => void;
2929
/** @ignore */
3030
export type NextFunc = (idx: number, batch: RecordBatch) => void;
3131

32-
Table.prototype.countBy = function(this: Table, name: Col | string) { return new DataFrame(this.chunks).countBy(name); };
33-
Table.prototype.scan = function(this: Table, next: NextFunc, bind?: BindFunc) { return new DataFrame(this.chunks).scan(next, bind); };
34-
Table.prototype.scanReverse = function(this: Table, next: NextFunc, bind?: BindFunc) { return new DataFrame(this.chunks).scanReverse(next, bind); };
35-
Table.prototype.filter = function(this: Table, predicate: Predicate): FilteredDataFrame { return new DataFrame(this.chunks).filter(predicate); };
36-
32+
/**
33+
* `DataFrame` extends {@link Table} with support for predicate filtering.
34+
*
35+
* A `DataFrame` can be constructed with the same arguments as a `Table`, or created
36+
* from an existing `Table` by passing it to the constructor.
37+
*
38+
* ```ts
39+
* const df = new DataFrame(table);
40+
* ```
41+
*/
3742
export class DataFrame<T extends { [key: string]: DataType } = any> extends Table<T> {
3843
public filter(predicate: Predicate): FilteredDataFrame<T> {
3944
return new FilteredDataFrame<T>(this.chunks, predicate);

js/src/table.ts

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,19 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
import { Data } from './data';
1918
import { Column } from './column';
20-
import { Schema, Field } from './schema';
21-
import { RecordBatch, _InternalEmptyPlaceholderRecordBatch } from './recordbatch';
22-
import { DataFrame } from './compute/dataframe';
19+
import { Data } from './data';
20+
import { TypedArray, TypedArrayDataType } from './interfaces';
2321
import { RecordBatchReader } from './ipc/reader';
24-
import { DataType, RowLike, Struct } from './type';
25-
import { selectColumnArgs, selectArgs } from './util/args';
26-
import { Clonable, Sliceable, Applicative } from './vector';
27-
import { isPromise, isIterable, isAsyncIterable } from './util/compat';
2822
import { RecordBatchFileWriter, RecordBatchStreamWriter } from './ipc/writer';
23+
import { RecordBatch, _InternalEmptyPlaceholderRecordBatch } from './recordbatch';
24+
import { Field, Schema } from './schema';
25+
import { DataType, RowLike, Struct } from './type';
26+
import { selectArgs, selectColumnArgs } from './util/args';
27+
import { isAsyncIterable, isIterable, isPromise } from './util/compat';
2928
import { distributeColumnsIntoRecordBatches, distributeVectorsIntoRecordBatches } from './util/recordbatch';
30-
import { Vector, Chunked, StructVector, VectorBuilderOptions, VectorBuilderOptionsAsync } from './vector/index';
31-
import { TypedArray, TypedArrayDataType } from './interfaces';
29+
import { Applicative, Clonable, Sliceable } from './vector';
30+
import { Chunked, StructVector, Vector, VectorBuilderOptions, VectorBuilderOptionsAsync } from './vector/index';
3231

3332
type VectorMap = { [key: string]: Vector | Exclude<TypedArray, Uint8ClampedArray> };
3433
type Fields<T extends { [key: string]: DataType }> = (keyof T)[] | Field<T[keyof T]>[];
@@ -43,17 +42,11 @@ export interface Table<T extends { [key: string]: DataType } = any> {
4342
slice(begin?: number, end?: number): Table<T>;
4443
concat(...others: Vector<Struct<T>>[]): Table<T>;
4544
clone(chunks?: RecordBatch<T>[], offsets?: Uint32Array): Table<T>;
46-
47-
scan(next: import('./compute/dataframe').NextFunc, bind?: import('./compute/dataframe').BindFunc): void;
48-
scanReverse(next: import('./compute/dataframe').NextFunc, bind?: import('./compute/dataframe').BindFunc): void;
49-
countBy(name: import('./compute/predicate').Col | string): import('./compute/dataframe').CountByResult;
50-
filter(predicate: import('./compute/predicate').Predicate): import('./compute/dataframe').FilteredDataFrame<T>;
5145
}
5246

5347
export class Table<T extends { [key: string]: DataType } = any>
5448
extends Chunked<Struct<T>>
55-
implements DataFrame<T>,
56-
Clonable<Table<T>>,
49+
implements Clonable<Table<T>>,
5750
Sliceable<Table<T>>,
5851
Applicative<Struct<T>, Table<T>> {
5952

@@ -173,6 +166,7 @@ export class Table<T extends { [key: string]: DataType } = any>
173166
return new Table(...distributeColumnsIntoRecordBatches(selectColumnArgs(cols)));
174167
}
175168

169+
constructor(table: Table<T>);
176170
constructor(batches: RecordBatch<T>[]);
177171
constructor(...batches: RecordBatch<T>[]);
178172
constructor(schema: Schema<T>, batches: RecordBatch<T>[]);
@@ -181,9 +175,9 @@ export class Table<T extends { [key: string]: DataType } = any>
181175

182176
let schema: Schema<T> = null!;
183177

184-
if (args[0] instanceof Schema) { schema = args.shift(); }
178+
if (args[0] instanceof Schema) { schema = args[0]; }
185179

186-
const chunks = selectArgs<RecordBatch<T>>(RecordBatch, args);
180+
const chunks = args[0] instanceof Table ? (args[0] as Table<T>).chunks : selectArgs<RecordBatch<T>>(RecordBatch, args);
187181

188182
if (!schema && !(schema = chunks[0]?.schema)) {
189183
throw new TypeError('Table must be initialized with a Schema or at least one RecordBatch');

js/test/unit/dataframe-tests.ts

Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
import '../jest-extensions';
19+
import {
20+
predicate, DataFrame, RecordBatch
21+
} from '../Arrow';
22+
import { test_data } from './table-tests';
23+
24+
const { col, lit, custom, and, or, And, Or } = predicate;
25+
26+
const F32 = 0, I32 = 1, DICT = 2;
27+
28+
describe(`DataFrame`, () => {
29+
30+
for (let datum of test_data) {
31+
describe(datum.name, () => {
32+
33+
describe(`scan()`, () => {
34+
test(`yields all values`, () => {
35+
const df = new DataFrame(datum.table());
36+
let expected_idx = 0;
37+
df.scan((idx, batch) => {
38+
const columns = batch.schema.fields.map((_, i) => batch.getChildAt(i)!);
39+
expect(columns.map((c) => c.get(idx))).toEqual(values[expected_idx++]);
40+
});
41+
});
42+
test(`calls bind function with every batch`, () => {
43+
const df = new DataFrame(datum.table());
44+
let bind = jest.fn();
45+
df.scan(() => { }, bind);
46+
for (let batch of df.chunks) {
47+
expect(bind).toHaveBeenCalledWith(batch);
48+
}
49+
});
50+
});
51+
describe(`scanReverse()`, () => {
52+
test(`yields all values`, () => {
53+
const df = new DataFrame(datum.table());
54+
let expected_idx = values.length;
55+
df.scanReverse((idx, batch) => {
56+
const columns = batch.schema.fields.map((_, i) => batch.getChildAt(i)!);
57+
expect(columns.map((c) => c.get(idx))).toEqual(values[--expected_idx]);
58+
});
59+
});
60+
test(`calls bind function with every batch`, () => {
61+
const df = new DataFrame(datum.table());
62+
let bind = jest.fn();
63+
df.scanReverse(() => { }, bind);
64+
for (let batch of df.chunks) {
65+
expect(bind).toHaveBeenCalledWith(batch);
66+
}
67+
});
68+
});
69+
test(`count() returns the correct length`, () => {
70+
const df = new DataFrame(datum.table());
71+
const values = datum.values();
72+
expect(df.count()).toEqual(values.length);
73+
});
74+
test(`getColumnIndex`, () => {
75+
const df = new DataFrame(datum.table());
76+
expect(df.getColumnIndex('i32')).toEqual(I32);
77+
expect(df.getColumnIndex('f32')).toEqual(F32);
78+
expect(df.getColumnIndex('dictionary')).toEqual(DICT);
79+
});
80+
const df = new DataFrame(datum.table());
81+
const values = datum.values();
82+
let get_i32: (idx: number) => number, get_f32: (idx: number) => number;
83+
const filter_tests = [
84+
{
85+
name: `filter on f32 >= 0`,
86+
filtered: df.filter(col('f32').ge(0)),
87+
expected: values.filter((row) => row[F32] >= 0)
88+
}, {
89+
name: `filter on 0 <= f32`,
90+
filtered: df.filter(lit(0).le(col('f32'))),
91+
expected: values.filter((row) => 0 <= row[F32])
92+
}, {
93+
name: `filter on i32 <= 0`,
94+
filtered: df.filter(col('i32').le(0)),
95+
expected: values.filter((row) => row[I32] <= 0)
96+
}, {
97+
name: `filter on 0 >= i32`,
98+
filtered: df.filter(lit(0).ge(col('i32'))),
99+
expected: values.filter((row) => 0 >= row[I32])
100+
}, {
101+
name: `filter on f32 < 0`,
102+
filtered: df.filter(col('f32').lt(0)),
103+
expected: values.filter((row) => row[F32] < 0)
104+
}, {
105+
name: `filter on i32 > 1 (empty)`,
106+
filtered: df.filter(col('i32').gt(0)),
107+
expected: values.filter((row) => row[I32] > 0)
108+
}, {
109+
name: `filter on f32 <= -.25 || f3 >= .25`,
110+
filtered: df.filter(col('f32').le(-.25).or(col('f32').ge(.25))),
111+
expected: values.filter((row) => row[F32] <= -.25 || row[F32] >= .25)
112+
}, {
113+
name: `filter on !(f32 <= -.25 || f3 >= .25) (not)`,
114+
filtered: df.filter(col('f32').le(-.25).or(col('f32').ge(.25)).not()),
115+
expected: values.filter((row) => !(row[F32] <= -.25 || row[F32] >= .25))
116+
}, {
117+
name: `filter method combines predicates (f32 >= 0 && i32 <= 0)`,
118+
filtered: df.filter(col('i32').le(0)).filter(col('f32').ge(0)),
119+
expected: values.filter((row) => row[I32] <= 0 && row[F32] >= 0)
120+
}, {
121+
name: `filter on dictionary == 'a'`,
122+
filtered: df.filter(col('dictionary').eq('a')),
123+
expected: values.filter((row) => row[DICT] === 'a')
124+
}, {
125+
name: `filter on 'a' == dictionary (commutativity)`,
126+
filtered: df.filter(lit('a').eq(col('dictionary'))),
127+
expected: values.filter((row) => row[DICT] === 'a')
128+
}, {
129+
name: `filter on dictionary != 'b'`,
130+
filtered: df.filter(col('dictionary').ne('b')),
131+
expected: values.filter((row) => row[DICT] !== 'b')
132+
}, {
133+
name: `filter on f32 >= i32`,
134+
filtered: df.filter(col('f32').ge(col('i32'))),
135+
expected: values.filter((row) => row[F32] >= row[I32])
136+
}, {
137+
name: `filter on f32 <= i32`,
138+
filtered: df.filter(col('f32').le(col('i32'))),
139+
expected: values.filter((row) => row[F32] <= row[I32])
140+
}, {
141+
name: `filter on f32*i32 > 0 (custom predicate)`,
142+
filtered: df.filter(custom(
143+
(idx: number) => (get_f32(idx) * get_i32(idx) > 0),
144+
(batch: RecordBatch) => {
145+
get_f32 = col('f32').bind(batch);
146+
get_i32 = col('i32').bind(batch);
147+
})),
148+
expected: values.filter((row) => (row[F32] as number) * (row[I32] as number) > 0)
149+
}, {
150+
name: `filter out all records`,
151+
filtered: df.filter(lit(1).eq(0)),
152+
expected: []
153+
}
154+
];
155+
for (let this_test of filter_tests) {
156+
const { name, filtered, expected } = this_test;
157+
describe(name, () => {
158+
test(`count() returns the correct length`, () => {
159+
expect(filtered.count()).toEqual(expected.length);
160+
});
161+
describe(`scan()`, () => {
162+
test(`iterates over expected values`, () => {
163+
let expected_idx = 0;
164+
filtered.scan((idx, batch) => {
165+
const columns = batch.schema.fields.map((_, i) => batch.getChildAt(i)!);
166+
expect(columns.map((c) => c.get(idx))).toEqual(expected[expected_idx++]);
167+
});
168+
});
169+
test(`calls bind function lazily`, () => {
170+
let bind = jest.fn();
171+
filtered.scan(() => { }, bind);
172+
if (expected.length) {
173+
expect(bind).toHaveBeenCalled();
174+
} else {
175+
expect(bind).not.toHaveBeenCalled();
176+
}
177+
});
178+
});
179+
describe(`scanReverse()`, () => {
180+
test(`iterates over expected values in reverse`, () => {
181+
let expected_idx = expected.length;
182+
filtered.scanReverse((idx, batch) => {
183+
const columns = batch.schema.fields.map((_, i) => batch.getChildAt(i)!);
184+
expect(columns.map((c) => c.get(idx))).toEqual(expected[--expected_idx]);
185+
});
186+
});
187+
test(`calls bind function lazily`, () => {
188+
let bind = jest.fn();
189+
filtered.scanReverse(() => { }, bind);
190+
if (expected.length) {
191+
expect(bind).toHaveBeenCalled();
192+
} else {
193+
expect(bind).not.toHaveBeenCalled();
194+
}
195+
});
196+
});
197+
});
198+
}
199+
test(`countBy on dictionary returns the correct counts`, () => {
200+
// Make sure countBy works both with and without the Col wrapper
201+
// class
202+
let expected: { [key: string]: number } = { 'a': 0, 'b': 0, 'c': 0 };
203+
for (let row of values) {
204+
expected[row[DICT]] += 1;
205+
}
206+
207+
expect(df.countBy(col('dictionary')).toJSON()).toEqual(expected);
208+
expect(df.countBy('dictionary').toJSON()).toEqual(expected);
209+
});
210+
test(`countBy on dictionary with filter returns the correct counts`, () => {
211+
let expected: { [key: string]: number } = { 'a': 0, 'b': 0, 'c': 0 };
212+
for (let row of values) {
213+
if (row[I32] === 1) { expected[row[DICT]] += 1; }
214+
}
215+
216+
expect(df.filter(col('i32').eq(1)).countBy('dictionary').toJSON()).toEqual(expected);
217+
});
218+
test(`countBy on non dictionary column throws error`, () => {
219+
expect(() => { df.countBy('i32'); }).toThrow();
220+
expect(() => { df.filter(col('dict').eq('a')).countBy('i32'); }).toThrow();
221+
});
222+
test(`countBy on non-existent column throws error`, () => {
223+
expect(() => { df.countBy('FAKE' as any); }).toThrow();
224+
});
225+
test(`table.select() basic tests`, () => {
226+
let selected = df.select('f32', 'dictionary');
227+
expect(selected.schema.fields).toHaveLength(2);
228+
expect(selected.schema.fields[0]).toEqual(df.schema.fields[0]);
229+
expect(selected.schema.fields[1]).toEqual(df.schema.fields[2]);
230+
231+
expect(selected).toHaveLength(values.length);
232+
let idx = 0, expected_row;
233+
for (let row of selected) {
234+
expected_row = values[idx++];
235+
expect(row.f32).toEqual(expected_row[F32]);
236+
expect(row.dictionary).toEqual(expected_row[DICT]);
237+
}
238+
});
239+
// test(`table.toString()`, () => {
240+
// let selected = table.select('i32', 'dictionary');
241+
// let headers = [`"row_id"`, `"i32: Int32"`, `"dictionary: Dictionary<Int8, Utf8>"`];
242+
// let expected = [headers.join(' | '), ...values.map((row, idx) => {
243+
// return [`${idx}`, `${row[I32]}`, `"${row[DICT]}"`].map((str, col) => {
244+
// return leftPad(str, ' ', headers[col].length);
245+
// }).join(' | ');
246+
// })].join('\n') + '\n';
247+
// expect(selected.toString()).toEqual(expected);
248+
// });
249+
test(`table.filter(..).count() on always false predicates returns 0`, () => {
250+
expect(df.filter(col('i32').ge(100)).count()).toEqual(0);
251+
expect(df.filter(col('dictionary').eq('z')).count()).toEqual(0);
252+
});
253+
describe(`lit-lit comparison`, () => {
254+
test(`always-false count() returns 0`, () => {
255+
expect(df.filter(lit('abc').eq('def')).count()).toEqual(0);
256+
expect(df.filter(lit(0).ge(1)).count()).toEqual(0);
257+
});
258+
test(`always-true count() returns length`, () => {
259+
expect(df.filter(lit('abc').eq('abc')).count()).toEqual(df.length);
260+
expect(df.filter(lit(-100).le(0)).count()).toEqual(df.length);
261+
});
262+
});
263+
describe(`col-col comparison`, () => {
264+
test(`always-false count() returns 0`, () => {
265+
expect(df.filter(col('dictionary').eq(col('i32'))).count()).toEqual(0);
266+
});
267+
test(`always-true count() returns length`, () => {
268+
expect(df.filter(col('dictionary').eq(col('dictionary'))).count()).toEqual(df.length);
269+
});
270+
});
271+
});
272+
}
273+
});
274+
275+
describe(`Predicate`, () => {
276+
const p1 = col('a').gt(100);
277+
const p2 = col('a').lt(1000);
278+
const p3 = col('b').eq('foo');
279+
const p4 = col('c').eq('bar');
280+
const expected = [p1, p2, p3, p4];
281+
test(`and flattens children`, () => {
282+
expect(and(p1, p2, p3, p4).children).toEqual(expected);
283+
expect(and(p1.and(p2), new And(p3, p4)).children).toEqual(expected);
284+
expect(and(p1.and(p2, p3, p4)).children).toEqual(expected);
285+
});
286+
test(`or flattens children`, () => {
287+
expect(or(p1, p2, p3, p4).children).toEqual(expected);
288+
expect(or(p1.or(p2), new Or(p3, p4)).children).toEqual(expected);
289+
expect(or(p1.or(p2, p3, p4)).children).toEqual(expected);
290+
});
291+
});

0 commit comments

Comments
 (0)