diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts
index 0685d262cc186..abc11eff509d9 100644
--- a/js/src/Arrow.externs.ts
+++ b/js/src/Arrow.externs.ts
@@ -54,6 +54,16 @@ Table.prototype.toString;
 Table.prototype.lengths;
 /** @type {?} */
 Table.prototype.batches;
+/** @type {?} */
+Table.prototype.countBy;
+/** @type {?} */
+Table.prototype.scan;
+/** @type {?} */
+Table.prototype.get;
+
+let CountByResult = function() {};
+/** @type {?} */
+CountByResult.prototype.asJSON;
 
 let Vector = function() {};
 /** @type {?} */
diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts
index 926ee88720bf0..21eb2976d44a4 100644
--- a/js/src/Arrow.ts
+++ b/js/src/Arrow.ts
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-import { Table, TableRow } from './table';
-import { lit, col } from './predicate';
+import { Table, TableRow, CountByResult } from './table';
+import { lit, col, Col, Value } from './predicate';
 import { Vector } from './vector/vector';
 import { Utf8Vector } from './vector/utf8';
 import { DictionaryVector } from './vector/dictionary';
@@ -54,8 +54,8 @@ Table['fromAsync'] = Table.fromAsync;
 BoolVector['pack'] = BoolVector.pack;
 
 export { read, readAsync };
-export { Table, TableRow };
-export { lit, col };
+export { Table, TableRow, CountByResult };
+export { lit, col, Col, Value };
 export { Vector, StructRow };
 export { Uint64, Int64, Int128 };
 export { NumericVectorConstructor } from './vector/numeric';
@@ -94,9 +94,11 @@ try {
     // string indexers tell closure compiler not to rename these properties
     Arrow['lit'] = lit;
     Arrow['col'] = col;
+    Arrow['Col'] = Col;
     Arrow['read'] = read;
-    Arrow['readAsync'] = readAsync;
+    Arrow['Value'] = Value;
     Arrow['Table'] = Table;
+    Arrow['readAsync'] = readAsync;
     Arrow['Vector'] = Vector;
     Arrow['StructRow'] = StructRow;
     Arrow['BoolVector'] = BoolVector;
@@ -120,6 +122,7 @@ try {
     Arrow['Float32Vector'] = Float32Vector;
     Arrow['Float64Vector'] = Float64Vector;
     Arrow['DecimalVector'] = DecimalVector;
+    Arrow['CountByResult'] = CountByResult;
     Arrow['TimestampVector'] = TimestampVector;
     Arrow['DictionaryVector'] = DictionaryVector;
     Arrow['FixedSizeListVector'] = FixedSizeListVector;
diff --git a/js/src/table.ts b/js/src/table.ts
index 6f312746f2c71..f00b5ef9da1df 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -42,7 +42,7 @@ export interface DataFrame {
     filter(predicate: Predicate): DataFrame;
     scan(next: NextFunc): void;
     count(): number;
-    countBy(col: (Col|string)): Table;
+    countBy(col: (Col|string)): CountByResult;
 }
 
 function columnsFromBatches(batches: Vector[][]) {
@@ -87,7 +87,7 @@ export class Table implements DataFrame {
     }
     get(idx: number): TableRow {
         let batch = 0;
-        while (idx > this.lengths[batch] && batch < this.lengths.length) {
+        while (batch < this.lengths.length && idx >= this.lengths[batch]) {
             idx -= this.lengths[batch++];
         }
 
@@ -114,7 +114,8 @@ export class Table implements DataFrame {
     count(): number {
-        return this.lengths.reduce((acc, val) => acc + val);
+        return this.lengths.reduce((acc, val) => acc + val, 0);
     }
-    countBy(count_by: (Col|string)): Table {
-        if (count_by instanceof String) {
+    countBy(count_by: (Col|string)): CountByResult {
+        // string primitives are never `instanceof String`, so test with typeof
+        if (typeof count_by === 'string') {
             count_by = new Col(count_by);
         }
@@ -146,7 +147,7 @@ export class Table implements DataFrame {
             }
         }
 
-        return new Table({batches: [[keys, new Uint32Vector({data: counts})]]})
+        return new CountByResult(keys, new Uint32Vector({data: counts}));
     }
     *[Symbol.iterator]() {
         for (let batch = -1; ++batch < this.lengths.length;) {
@@ -215,7 +216,8 @@ class FilteredDataFrame implements DataFrame {
         );
     }
 
-    countBy(count_by: (Col|string)): Table {
-        if (count_by instanceof String) {
+    countBy(count_by: (Col|string)): CountByResult {
+        // string primitives are never `instanceof String`, so test with typeof
+        if (typeof count_by === 'string') {
             count_by = new Col(count_by);
         }
@@ -246,6 +248,30 @@ class FilteredDataFrame implements DataFrame {
             }
         }
 
-        return new Table({batches: [[keys, new Uint32Vector({data: counts})]]})
+        return new CountByResult(keys, new Uint32Vector({data: counts}));
+    }
+}
+
+/**
+ * Result of a `countBy` call: a single-batch, two-column Table holding the
+ * keys and their counts, also exposed directly as `keys` and `counts`.
+ */
+export class CountByResult extends Table implements DataFrame {
+    constructor(readonly keys: Vector, readonly counts: Vector) {
+        super({batches: [[keys, counts]]});
+    }
+
+    /**
+     * Collects the result into a plain object mapping each key to its count.
+     * Keys are coerced to strings because they become property names.
+     */
+    asJSON(): {[key: string]: number|null} {
+        const result: {[key: string]: number|null} = {};
+
+        for (let i = -1; ++i < this.length;) {
+            result[this.keys.get(i)] = this.counts.get(i);
+        }
+
+        return result;
     }
 }
diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts
new file mode 100644
index 0000000000000..33fb2d178b0d2
--- /dev/null
+++ b/js/test/unit/table-tests.ts
@@ -0,0 +1,203 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import Arrow from '../Arrow';
+
+const { col, Table } = Arrow;
+
+// Wrap a floating point value in a Float32Array and take it back out to
+// make sure that equality checks against the f32 column will pass.
+const float32 = (value: number) => new Float32Array([value])[0];
+
+// Builds one JSON record batch from the raw data of the three columns.
+function recordBatch(f32: number[], i32: number[], dictionary: number[]) {
+    const column = (name: string, data: number[]) => (
+        {"name": name, "count": data.length, "VALIDITY": [], "DATA": data}
+    );
+    return {
+        "count": f32.length,
+        "columns": [column("f32", f32), column("i32", i32), column("dictionary", dictionary)]
+    };
+}
+
+// Wraps record batches in the schema and dictionary shared by every fixture.
+// A fresh object is built on each call, so tables never share their input.
+function tableJSON(batches: object[]) {
+    return {
+        "schema": {
+            "fields": [
+                {
+                    "name": "f32",
+                    "type": {
+                        "name": "floatingpoint",
+                        "precision": "SINGLE"
+                    },
+                    "nullable": false,
+                    "children": [],
+                },
+                {
+                    "name": "i32",
+                    "type": {
+                        "name": "int",
+                        "isSigned": true,
+                        "bitWidth": 32
+                    },
+                    "nullable": false,
+                    "children": [],
+                },
+                {
+                    "name": "dictionary",
+                    "type": {
+                        "name": "utf8"
+                    },
+                    "nullable": false,
+                    "children": [],
+                    "dictionary": {
+                        "id": 0,
+                        "indexType": {
+                            "name": "int",
+                            "isSigned": true,
+                            "bitWidth": 8
+                        },
+                        "isOrdered": false
+                    }
+                }
+            ]
+        },
+        "dictionaries": [{
+            "id": 0,
+            "data": {
+                "count": 3,
+                "columns": [
+                    {
+                        "name": "DICT0",
+                        "count": 3,
+                        "VALIDITY": [],
+                        "OFFSET": [0, 1, 2, 3],
+                        "DATA": ["a", "b", "c"]
+                    }
+                ]
+            }
+        }],
+        "batches": batches
+    };
+}
+
+// Each fixture holds the record batches to load, the rows they should produce,
+// and the expected results of the filter/countBy queries run against them.
+const fixtures = [
+    {
+        name: `single record batch`,
+        batches: [
+            recordBatch(
+                [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3],
+                [-1, 1, -1, 1, -1, 1, -1],
+                [0, 1, 2, 0, 1, 2, 0]
+            ),
+        ],
+        values: [
+            [float32(-0.3), -1, 'a'],
+            [float32(-0.2), 1, 'b'],
+            [float32(-0.1), -1, 'c'],
+            [float32(0), 1, 'a'],
+            [float32(0.1), -1, 'b'],
+            [float32(0.2), 1, 'c'],
+            [float32(0.3), -1, 'a'],
+        ],
+        f32GteqZero: 4,
+        i32LteqZero: 4,
+        dictionaryEqA: 3,
+        counts: {'a': 3, 'b': 2, 'c': 2},
+        countsWhereI32Eq1: {'a': 1, 'b': 1, 'c': 1},
+    },
+    {
+        name: `multiple record batches`,
+        batches: [
+            recordBatch([-0.3, -0.2, -0.1], [-1, 1, -1], [0, 1, 2]),
+            recordBatch([0, 0.1, 0.2], [1, -1, 1], [0, 1, 2]),
+            recordBatch([0.3, 0.2, 0.1], [-1, 1, -1], [0, 1, 2]),
+        ],
+        values: [
+            [float32(-0.3), -1, 'a'],
+            [float32(-0.2), 1, 'b'],
+            [float32(-0.1), -1, 'c'],
+            [float32(0), 1, 'a'],
+            [float32(0.1), -1, 'b'],
+            [float32(0.2), 1, 'c'],
+            [float32(0.3), -1, 'a'],
+            [float32(0.2), 1, 'b'],
+            [float32(0.1), -1, 'c'],
+        ],
+        f32GteqZero: 6,
+        i32LteqZero: 5,
+        dictionaryEqA: 3,
+        counts: {'a': 3, 'b': 3, 'c': 3},
+        countsWhereI32Eq1: {'a': 1, 'b': 2, 'c': 1},
+    },
+];
+
+describe(`Table`, () => {
+    for (const fixture of fixtures) {
+        describe(fixture.name, () => {
+            const table = Table.from(tableJSON(fixture.batches));
+            const values = fixture.values;
+
+            test(`has the correct length`, () => {
+                expect(table.length).toEqual(values.length);
+            });
+            test(`gets expected values`, () => {
+                for (let i = -1; ++i < values.length;) {
+                    expect(table.get(i).toArray()).toEqual(values[i]);
+                }
+            });
+            test(`iterates expected values`, () => {
+                let i = 0;
+                for (let row of table) {
+                    expect(row.toArray()).toEqual(values[i++]);
+                }
+            });
+            test(`scans expected values`, () => {
+                let expected_idx = 0;
+                table.scan((idx, cols) => {
+                    expect(cols.map((c) => c.get(idx))).toEqual(values[expected_idx++]);
+                });
+            });
+            test(`count() returns the correct length`, () => {
+                expect(table.count()).toEqual(values.length);
+            });
+            test(`filter on f32 >= 0 returns the correct length`, () => {
+                expect(table.filter(col('f32').gteq(0)).count()).toEqual(fixture.f32GteqZero);
+            });
+            test(`filter on i32 <= 0 returns the correct length`, () => {
+                expect(table.filter(col('i32').lteq(0)).count()).toEqual(fixture.i32LteqZero);
+            });
+            test(`filter on dictionary == 'a' returns the correct length`, () => {
+                expect(table.filter(col('dictionary').eq('a')).count()).toEqual(fixture.dictionaryEqA);
+            });
+            test(`countBy on dictionary returns the correct counts`, () => {
+                expect(table.countBy(col('dictionary')).asJSON()).toEqual(fixture.counts);
+            });
+            test(`countBy on a column name returns the correct counts`, () => {
+                expect(table.countBy('dictionary').asJSON()).toEqual(fixture.counts);
+            });
+            test(`countBy on dictionary with filter returns the correct counts`, () => {
+                expect(table.filter(col('i32').eq(1)).countBy(col('dictionary')).asJSON())
+                    .toEqual(fixture.countsWhereI32Eq1);
+            });
+        });
+    }
+});