diff --git a/js/gulp/uglify-task.js b/js/gulp/uglify-task.js index bcda8cae60df5..988830f31bd80 100644 --- a/js/gulp/uglify-task.js +++ b/js/gulp/uglify-task.js @@ -91,6 +91,7 @@ const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, `../${src}/table.js`, `../${src}/vector.js`, `../${src}/util/int.js`, + `../${src}/recordbatch.js`, `../${src}/${mainExport}.js`, ]; return publicModulePaths.reduce((keywords, publicModulePath) => [ diff --git a/js/gulp/util.js b/js/gulp/util.js index 679588d44205c..f35a447e70830 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -87,7 +87,7 @@ const ESKeywords = [ // EventTarget `addListener`, `removeListener`, `addEventListener`, `removeEventListener`, // Arrow properties - `low`, `high`, `data`, `index`, `field`, `validity`, `columns`, `fieldNode`, `subarray`, + `low`, `high`, `data`, `index`, `field`, `columns`, 'numCols', 'numRows', `values`, `valueOffsets`, `nullBitmap`, `subarray` ]; function taskName(target, format) { diff --git a/js/package.json b/js/package.json index deae744521df6..d569145da4cd0 100644 --- a/js/package.json +++ b/js/package.json @@ -12,6 +12,7 @@ "clean": "gulp clean", "debug": "gulp debug", "perf": "node ./perf/index.js", + "create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow", "release": "./npm-release.sh", "clean:all": "run-p clean clean:testdata", "clean:testdata": "gulp clean:testdata", @@ -51,16 +52,15 @@ ], "dependencies": { "@types/text-encoding-utf-8": "1.0.1", - "command-line-args": "5.0.0", + "command-line-args": "5.0.1", "command-line-usage": "4.1.0", "flatbuffers": "trxcllnt/flatbuffers-esm", "json-bignum": "0.0.3", "text-encoding-utf-8": "^1.0.2", - "ts-node": "4.1.0", - "tslib": "1.8.1" + "tslib": "1.9.0" }, "devDependencies": { - "@std/esm": "0.19.6", + "@std/esm": "0.19.7", "@types/flatbuffers": "1.6.5", "@types/glob": "5.0.34", "@types/jest": "22.0.1", @@ -80,11 +80,11 @@ "gulp-transform-js-ast": "1.0.2", "gulp-typescript": "3.2.3", "ix": "2.3.4", - "jest": "22.1.2", + "jest": "22.1.3", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", "lerna": "2.7.1", - "lint-staged": "6.0.0", + "lint-staged": "6.0.1", "merge2": "1.2.1", "mkdirp": "0.5.1", "npm-run-all": "4.1.2", @@ -130,7 +130,7 @@ "lcov" ], "coveragePathIgnorePatterns": [ - "format\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", + "fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", "test\\/.*\\.(ts|tsx|js)$", "/node_modules/" ], diff --git a/js/perf/index.js b/js/perf/index.js index 9eac40e64ac71..d276b7db9c236 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,29 +16,40 @@ // under the License. // Use the ES5 UMD target as perf baseline -// const { Table, readVectors } = require('../targets/es5/umd'); -// const { Table, readVectors } = require('../targets/es5/cjs'); -const { Table, readVectors } = require('../targets/es2015/umd'); -// const { Table, readVectors } = require('../targets/es2015/cjs'); +// const { col, Table, read: readBatches } = require('../targets/es5/umd'); +// const { col, Table, read: readBatches } = require('../targets/es5/cjs'); +// const { col, Table, read: readBatches } = require('../targets/es2015/umd'); +const { col, Table, read: readBatches } = require('../targets/es2015/cjs'); -const config = require('./config'); const Benchmark = require('benchmark'); const suites = []; -for (let { name, buffers} of config) { - const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true }); - const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true }); - const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true }); - const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true }); - parseSuite.add(createFromTableTest(name, buffers)); - parseSuite.add(createReadVectorsTest(name, buffers)); - for (const vector of Table.from(buffers).columns) { - sliceSuite.add(createSliceTest(vector)); - iterateSuite.add(createIterateTest(vector)); - getByIndexSuite.add(createGetByIndexTest(vector)); - } - suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); +for (let { name, buffers } of require('./table_config')) { + const parseSuiteName = `Parse "${name}"`; + const sliceSuiteName = `Slice "${name}" vectors`; + const iterateSuiteName = `Iterate "${name}" vectors`; + const getByIndexSuiteName = `Get "${name}" values by index`; + const sliceToArraySuiteName = `Slice toArray "${name}" vectors`; + suites.push(createTestSuite(parseSuiteName, createFromTableTest(name, buffers))); + suites.push(createTestSuite(parseSuiteName, createReadBatchesTest(name, buffers))); + const table = Table.from(buffers); + suites.push(...table.columns.map((vector, i) => createTestSuite(getByIndexSuiteName, createGetByIndexTest(vector, table.schema.fields[i].name)))); + suites.push(...table.columns.map((vector, i) => createTestSuite(iterateSuiteName, createIterateTest(vector, table.schema.fields[i].name)))); + suites.push(...table.columns.map((vector, i) => createTestSuite(sliceToArraySuiteName, createSliceToArrayTest(vector, table.schema.fields[i].name)))); + suites.push(...table.columns.map((vector, i) => createTestSuite(sliceSuiteName, createSliceTest(vector, table.schema.fields[i].name)))); +} + +for (let {name, buffers, countBys, counts} of require('./table_config')) { + const table = Table.from(buffers); + + const dfCountBySuiteName = `DataFrame Count By "${name}"`; + const dfFilterCountSuiteName = `DataFrame Filter-Scan Count "${name}"`; + const dfDirectCountSuiteName = `DataFrame Direct Count "${name}"`; + + suites.push(...countBys.map((countBy) => createTestSuite(dfCountBySuiteName, createDataFrameCountByTest(table, countBy)))); + suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfFilterCountSuiteName, createDataFrameFilterCountTest(table, col, test, value)))); + suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfDirectCountSuiteName, createDataFrameDirectCountTest(table, col, test, value)))); } console.log('Running apache-arrow performance tests...\n'); @@ -52,7 +63,7 @@ function run() { var str = x.toString(); var meanMsPerOp = Math.round(x.stats.mean * 100000)/100; var sliceOf60FPS = Math.round((meanMsPerOp / (1000/60)) * 100000)/1000; - return `${str} (avg: ${meanMsPerOp}ms, or ${sliceOf60FPS}% of a frame @ 60FPS) ${x.suffix || ''}`; + return `${str}\n avg: ${meanMsPerOp}ms\n ${sliceOf60FPS}% of a frame @ 60FPS ${x.suffix || ''}`; }).join('\n') + '\n'); if (suites.length > 0) { setTimeout(run, 1000); @@ -61,47 +72,60 @@ function run() { .run({ async: true }); } +function createTestSuite(name, test) { + return new Benchmark.Suite(name, { async: true }).add(test); +} + function createFromTableTest(name, buffers) { let table; return { async: true, - name: `Table.from`, + name: `Table.from\n`, fn() { table = Table.from(buffers); } }; } -function createReadVectorsTest(name, buffers) { - let vectors; +function createReadBatchesTest(name, buffers) { + let recordBatch; return { async: true, - name: `readVectors`, - fn() { for (vectors of readVectors(buffers)) {} } + name: `readBatches\n`, + fn() { for (recordBatch of readBatches(buffers)) {} } }; } -function createSliceTest(vector) { +function createSliceTest(vector, name) { let xs; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { xs = vector.slice(); } }; } -function createIterateTest(vector) { +function createSliceToArrayTest(vector, name) { + let xs; + return { + async: true, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, + fn() { xs = vector.slice().toArray(); } + }; +} + +function createIterateTest(vector, name) { let value; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { for (value of vector) {} } }; } -function createGetByIndexTest(vector) { +function createGetByIndexTest(vector, name) { let value; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { for (let i = -1, n = vector.length; ++i < n;) { value = vector.get(i); @@ -109,3 +133,80 @@ function createGetByIndexTest(vector) { } }; } + +function createDataFrameDirectCountTest(table, column, test, value) { + let sum, colidx = table.schema.fields.findIndex((c)=>c.name === column); + + if (test == 'gteq') { + op = function () { + sum = 0; + let batches = table.batches; + let numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const { numRows, columns } = batches[batchIndex]; + const vector = columns[colidx]; + // yield all indices + for (let index = -1; ++index < numRows;) { + sum += (vector.get(index) >= value); + } + } + } + } else if (test == 'eq') { + op = function() { + sum = 0; + let batches = table.batches; + let numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const { numRows, columns } = batches[batchIndex]; + const vector = columns[colidx]; + // yield all indices + for (let index = -1; ++index < numRows;) { + sum += (vector.get(index) === value); + } + } + } + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}\n`, + fn: op + }; +} + +function createDataFrameCountByTest(table, column) { + let colidx = table.schema.fields.findIndex((c)=> c.name === column); + + return { + async: true, + name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}\n`, + fn() { + table.countBy(column); + } + }; +} + +function createDataFrameFilterCountTest(table, column, test, value) { + let colidx = table.schema.fields.findIndex((c)=> c.name === column); + let df; + + if (test == 'gteq') { + df = table.filter(col(column).gteq(value)); + } else if (test == 'eq') { + df = table.filter(col(column).eq(value)); + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${column}', length: ${table.numRows}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}\n`, + fn() { + df.count(); + } + }; +} diff --git a/js/perf/table_config.js b/js/perf/table_config.js new file mode 100644 index 0000000000000..e3c332c870f38 --- /dev/null +++ b/js/perf/table_config.js @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const fs = require('fs'); +const path = require('path'); +const glob = require('glob'); + +const config = []; +const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`)); + +countBys = { + "tracks": ['origin', 'destination'] +} +counts = { + "tracks": [ + {col: 'lat', test: 'gteq', value: 0 }, + {col: 'lng', test: 'gteq', value: 0 }, + {col: 'origin', test: 'eq', value: 'Seattle'}, + ] +} + +for (const filename of filenames) { + const { name } = path.parse(filename); + if (name in counts) { + config.push({ + name, + buffers: [fs.readFileSync(filename)], + countBys: countBys[name], + counts: counts[name], + }); + } +} + +module.exports = config; diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js index 174ddad40e3db..6c8a7e79f3b67 100644 --- a/js/src/Arrow.externs.js +++ b/js/src/Arrow.externs.js @@ -51,6 +51,28 @@ Table.prototype.toArray; Table.prototype.select; /** @type {?} */ Table.prototype.rowsToString; +/** @type {?} */ +Table.prototype.lengths; +/** @type {?} */ +Table.prototype.batches; +/** @type {?} */ +Table.prototype.countBy; +/** @type {?} */ +Table.prototype.scan; +/** @type {?} */ +Table.prototype.get; + +var CountByResult = function() {}; +/** @type {?} */ +CountByResult.prototype.asJSON; + +let Col = function() {}; +/** @type {?} */ +Col.prototype.gteq; +/** @type {?} */ +Col.prototype.lteq; +/** @type {?} */ +Col.prototype.eq; var TableToStringIterator = function() {}; /** @type {?} */ @@ -58,17 +80,17 @@ TableToStringIterator.prototype.pipe; var RecordBatch = function() {}; /** @type {?} */ +RecordBatch.from = function() {}; +/** @type {?} */ +RecordBatch.prototype.numCols; +/** @type {?} */ RecordBatch.prototype.numRows; /** @type {?} */ RecordBatch.prototype.schema; /** @type {?} */ -RecordBatch.prototype.data; -/** @type {?} */ RecordBatch.prototype.columns; /** @type {?} */ -RecordBatch.prototype.numCols; -/** @type {?} */ -RecordBatch.prototype.concat; +RecordBatch.prototype.select; var Vector = function() {}; /** @type {?} */ @@ -152,6 +174,50 @@ Int128.prototype.plus /** @type {?} */ Int128.prototype.hex +var Type = function() {}; +/** @type {?} */ +Type.NONE = function() {}; +/** @type {?} */ +Type.Null = function() {}; +/** @type {?} */ +Type.Int = function() {}; +/** @type {?} */ +Type.Float = function() {}; +/** @type {?} */ +Type.Binary = function() {}; +/** @type {?} */ +Type.Utf8 = function() {}; +/** @type {?} */ +Type.Bool = function() {}; +/** @type {?} */ +Type.Decimal = function() {}; +/** @type {?} */ +Type.Date = function() {}; +/** @type {?} */ +Type.Time = function() {}; +/** @type {?} */ +Type.Timestamp = function() {}; +/** @type {?} */ +Type.Interval = function() {}; +/** @type {?} */ +Type.List = function() {}; +/** @type {?} */ +Type.Struct = function() {}; +/** @type {?} */ +Type.Union = function() {}; +/** @type {?} */ +Type.FixedSizeBinary = function() {}; +/** @type {?} */ +Type.FixedSizeList = function() {}; +/** @type {?} */ +Type.Map = function() {}; +/** @type {?} */ +Type.Dictionary = function() {}; +/** @type {?} */ +Type.DenseUnion = function() {}; +/** @type {?} */ +Type.SparseUnion = function() {}; + var DataType = function() {}; /** @type {?} */ DataType.isNull = function() {}; @@ -197,7 +263,25 @@ DataType.isDictionary = function() {}; DataType.prototype.ArrayType; var Schema = function() {}; +/** @type {?} */ +Schema.from = function() {}; +/** @type {?} */ +Schema.prototype.fields; +/** @type {?} */ +Schema.prototype.version; +/** @type {?} */ +Schema.prototype.metadata; +/** @type {?} */ +Schema.prototype.dictionaries; var Field = function() {}; +/** @type {?} */ +Field.prototype.name; +/** @type {?} */ +Field.prototype.type; +/** @type {?} */ +Field.prototype.nullable; +/** @type {?} */ +Field.prototype.metadata; var Null = function() {}; var Int8 = function() {}; var Int16 = function() {}; @@ -309,8 +393,18 @@ NestedVector.prototype.getChildAt; var NullVector = function() {}; var BoolVector = function() {}; +/** @type {?} */ +BoolVector.from = function() {}; +/** @type {?} */ +BoolVector.prototype.values; var IntVector = function() {}; +/** @type {?} */ +IntVector.from = function() {}; + var FloatVector = function() {}; +/** @type {?} */ +FloatVector.from = function() {}; + var DateVector = function() {}; var DecimalVector = function() {}; var TimeVector = function() {}; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 911ffb796f213..f890d8232f18c 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -17,11 +17,13 @@ import * as type_ from './type'; import * as data_ from './data'; -import { Vector } from './vector'; import * as vector_ from './vector'; import * as util_ from './util/int'; +import { Vector } from './vector'; +import { RecordBatch } from './recordbatch'; import { Schema, Field, Type } from './type'; -import { Table, RecordBatch } from './recordbatch'; +import { Table, CountByResult } from './table'; +import { lit, col, Col, Value } from './predicate'; import { read, readAsync } from './ipc/reader/arrow'; export import View = vector_.View; @@ -32,7 +34,9 @@ export import TimeBitWidth = type_.TimeBitWidth; export import TypedArrayConstructor = type_.TypedArrayConstructor; export { read, readAsync }; -export { Field, Schema, Table, RecordBatch, Vector, Type }; +export { Table, CountByResult }; +export { lit, col, Col, Value }; +export { Field, Schema, RecordBatch, Vector, Type }; export namespace util { export import Uint64 = util_.Uint64; @@ -125,23 +129,31 @@ try { Arrow['read'] = read; Arrow['readAsync'] = readAsync; + Arrow['Type'] = Type; Arrow['Field'] = Field; Arrow['Schema'] = Schema; - Arrow['Table'] = Table; - Arrow['RecordBatch'] = RecordBatch; Arrow['Vector'] = Vector; + Arrow['RecordBatch'] = RecordBatch; + Arrow['Table'] = Table; + Arrow['CountByResult'] = CountByResult; + Arrow['Value'] = Value; + Arrow['lit'] = lit; + Arrow['col'] = col; + Arrow['Col'] = Col; } } catch (e) { /* not the UMD bundle */ } /* end umd exports */ -// closure compiler always erases static method names: +// closure compiler erases static properties/methods: // https://github.com/google/closure-compiler/issues/1776 // set them via string indexers to save them from the mangler +Schema['from'] = Schema.from; Table['from'] = Table.from; Table['fromAsync'] = Table.fromAsync; Table['empty'] = Table.empty; Vector['create'] = Vector.create; +RecordBatch['from'] = RecordBatch.from; util_.Uint64['add'] = util_.Uint64.add; util_.Uint64['multiply'] = util_.Uint64.multiply; @@ -156,6 +168,28 @@ util_.Int128['fromString'] = util_.Int128.fromString; data_.ChunkedData['computeOffsets'] = data_.ChunkedData.computeOffsets; +(type_.Type as any)['NONE'] = type_.Type.NONE; +(type_.Type as any)['Null'] = type_.Type.Null; +(type_.Type as any)['Int'] = type_.Type.Int; +(type_.Type as any)['Float'] = type_.Type.Float; +(type_.Type as any)['Binary'] = type_.Type.Binary; +(type_.Type as any)['Utf8'] = type_.Type.Utf8; +(type_.Type as any)['Bool'] = type_.Type.Bool; +(type_.Type as any)['Decimal'] = type_.Type.Decimal; +(type_.Type as any)['Date'] = type_.Type.Date; +(type_.Type as any)['Time'] = type_.Type.Time; +(type_.Type as any)['Timestamp'] = type_.Type.Timestamp; +(type_.Type as any)['Interval'] = type_.Type.Interval; +(type_.Type as any)['List'] = type_.Type.List; +(type_.Type as any)['Struct'] = type_.Type.Struct; +(type_.Type as any)['Union'] = type_.Type.Union; +(type_.Type as any)['FixedSizeBinary'] = type_.Type.FixedSizeBinary; +(type_.Type as any)['FixedSizeList'] = type_.Type.FixedSizeList; +(type_.Type as any)['Map'] = type_.Type.Map; +(type_.Type as any)['Dictionary'] = type_.Type.Dictionary; +(type_.Type as any)['DenseUnion'] = type_.Type.DenseUnion; +(type_.Type as any)['SparseUnion'] = type_.Type.SparseUnion; + type_.DataType['isNull'] = type_.DataType.isNull; type_.DataType['isInt'] = type_.DataType.isInt; type_.DataType['isFloat'] = type_.DataType.isFloat; @@ -176,3 +210,7 @@ type_.DataType['isFixedSizeBinary'] = type_.DataType.isFixedSizeBinary; type_.DataType['isFixedSizeList'] = type_.DataType.isFixedSizeList; type_.DataType['isMap'] = type_.DataType.isMap; type_.DataType['isDictionary'] = type_.DataType.isDictionary; + +vector_.BoolVector['from'] = vector_.BoolVector.from; +vector_.IntVector['from'] = vector_.IntVector.from; +vector_.FloatVector['from'] = vector_.FloatVector.from; diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 86536b5a6ffee..ee9561323785b 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -84,11 +84,12 @@ if (!files.length) { } files.forEach((source) => { - // debugger; + debugger; let table: Arrow.Table, input = fs.readFileSync(source); try { table = Arrow.Table.from(input); } catch (e) { + debugger; table = Arrow.Table.from(parse(input + '')); } if (argv.schema && argv.schema.length) { diff --git a/js/src/data.ts b/js/src/data.ts index f68d1e3faf378..83ae071bc97ff 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -22,6 +22,7 @@ import { Int, Bool, FlatListType, List, FixedSizeList, Struct, Map_ } from './ty import { DataType, FlatType, ListType, NestedType, DenseUnion, SparseUnion } from './type'; export function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { + if (!ArrayType && ArrayBuffer.isView(values)) { return values; } return values instanceof ArrayType ? values : !values || !ArrayBuffer.isView(values) ? ArrayType.from(values || []) : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); diff --git a/js/src/predicate.ts b/js/src/predicate.ts new file mode 100644 index 0000000000000..1553bca7d1903 --- /dev/null +++ b/js/src/predicate.ts @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { RecordBatch } from './recordbatch'; +import { Vector, DictionaryVector } from './vector'; + +export type ValueFunc = (idx: number, cols: RecordBatch) => T | null; +export type PredicateFunc = (idx: number, cols: RecordBatch) => boolean; + +export abstract class Value { + eq(other: Value | T): Predicate { + if (!(other instanceof Value)) { other = new Literal(other); } + return new Equals(this, other); + } + lteq(other: Value | T): Predicate { + if (!(other instanceof Value)) { other = new Literal(other); } + return new LTeq(this, other); + } + gteq(other: Value | T): Predicate { + if (!(other instanceof Value)) { other = new Literal(other); } + return new GTeq(this, other); + } +} + +export class Literal extends Value { + constructor(public v: T) { super(); } +} + +export class Col extends Value { + // @ts-ignore + public vector: Vector; + // @ts-ignore + public colidx: number; + + constructor(public name: string) { super(); } + bind(batch: RecordBatch) { + if (!this.colidx) { + // Assume column index doesn't change between calls to bind + //this.colidx = cols.findIndex(v => v.name.indexOf(this.name) != -1); + this.colidx = -1; + const fields = batch.schema.fields; + for (let idx = -1; ++idx < fields.length;) { + if (fields[idx].name === this.name) { + this.colidx = idx; + break; + } + } + if (this.colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); } + } + this.vector = batch.columns[this.colidx]; + return this.vector.get.bind(this.vector); + } + + emitString() { return `cols[${this.colidx}].get(idx)`; } +} + +export abstract class Predicate { + abstract bind(batch: RecordBatch): PredicateFunc; + and(expr: Predicate): Predicate { return new And(this, expr); } + or(expr: Predicate): Predicate { return new Or(this, expr); } + ands(): Predicate[] { return [this]; } +} + +export abstract class ComparisonPredicate extends Predicate { + constructor(public readonly left: Value, public readonly right: Value) { + super(); + } + + bind(batch: RecordBatch) { + if (this.left instanceof Literal) { + if (this.right instanceof Literal) { + return this._bindLitLit(batch, this.left, this.right); + } else { // right is a Col + + return this._bindColLit(batch, this.right as Col, this.left); + } + } else { // left is a Col + if (this.right instanceof Literal) { + return this._bindColLit(batch, this.left as Col, this.right); + } else { // right is a Col + return this._bindColCol(batch, this.left as Col, this.right as Col); + } + } + } + + protected abstract _bindLitLit(batch: RecordBatch, left: Literal, right: Literal): PredicateFunc; + protected abstract _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc; + protected abstract _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc; +} + +abstract class CombinationPredicate extends Predicate { + constructor(public readonly left: Predicate, public readonly right: Predicate) { + super(); + } +} + +class And extends CombinationPredicate { + bind(batch: RecordBatch) { + const left = this.left.bind(batch); + const right = this.right.bind(batch); + return (idx: number, batch: RecordBatch) => left(idx, batch) && right(idx, batch); + } + ands(): Predicate[] { return this.left.ands().concat(this.right.ands()); } +} + +class Or extends CombinationPredicate { + bind(batch: RecordBatch) { + const left = this.left.bind(batch); + const right = this.right.bind(batch); + return (idx: number, batch: RecordBatch) => left(idx, batch) || right(idx, batch); + } +} + +export class Equals extends ComparisonPredicate { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v == right.v; + return () => rtrn; + } + + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, batch: RecordBatch) => left_func(idx, batch) == right_func(idx, batch); + } + + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + if (col.vector instanceof DictionaryVector) { + // Assume that there is only one key with the value `lit.v` + // TODO: add lazily-computed reverse dictionary lookups, associated + // with col.vector.data so that we only have to do this once per + // dictionary + let key = -1; + let dict = col.vector; + let data = dict.dictionary!; + for (let len = data.length; ++key < len;) { + if (data.get(key) === lit.v) { + break; + } + } + + if (key == data.length) { + // the value doesn't exist in the dictionary - always return + // false + // TODO: special-case of PredicateFunc that encapsulates this + // "always false" behavior. That way filtering operations don't + // have to bother checking + return () => false; + } else { + return (idx: number) => { + return dict.getKey(idx) === key; + }; + } + } else { + return (idx: number, cols: RecordBatch) => col_func(idx, cols) == lit.v; + } + } +} + +export class LTeq extends ComparisonPredicate { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v <= right.v; + return () => rtrn; + } + + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, cols: RecordBatch) => left_func(idx, cols) <= right_func(idx, cols); + } + + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => col_func(idx, cols) <= lit.v; + } +} + +export class GTeq extends ComparisonPredicate { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v >= right.v; + return () => rtrn; + } + + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, cols: RecordBatch) => left_func(idx, cols) >= right_func(idx, cols); + } + + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => col_func(idx, cols) >= lit.v; + } +} + +export function lit(n: number): Value { return new Literal(n); } +export function col(n: string): Col { return new Col(n); } diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts index 2bf5da367fbf5..3cc375510e23e 100644 --- a/js/src/recordbatch.ts +++ b/js/src/recordbatch.ts @@ -15,176 +15,64 @@ // specific language governing permissions and limitations // under the License. -import { Data, NestedData } from './data'; -import { Schema, DataType, Struct, IterableArrayLike } from './type'; +import { Schema, Struct } from './type'; import { flatbuffers } from 'flatbuffers'; -import { StructView, RowView } from './vector/nested'; -import { read, readAsync } from './ipc/reader/arrow'; -import { View, Vector, createVector } from './vector'; -import { isPromise, isAsyncIterable } from './util/compat'; +import { View, Vector, StructVector } from './vector'; +import { Data, NestedData, ChunkedData } from './data'; import Long = flatbuffers.Long; -export class RecordBatch { - public readonly numRows: number; - public readonly schema: Schema; - public readonly data: Data[]; - public readonly columns: Vector[]; - constructor(schema: Schema, numRows: Long | number, columnsOrData: (Data | Vector)[]) { - const data: Data[] = new Array(columnsOrData.length); - const columns: Vector[] = new Array(columnsOrData.length); - for (let index = -1, length = columnsOrData.length; ++index < length;) { - const col: Data | Vector = columnsOrData[index]; - if (col instanceof Vector) { - data[index] = (columns[index] = col as Vector).data; - } else { - columns[index] = createVector(data[index] = col); - } - } - this.data = data; - this.schema = schema; - this.columns = columns; - this.numRows = typeof numRows === 'number' ? numRows : numRows.low; - } - public get numCols() { return this.columns.length; } - public concat(...others: RecordBatch[]): RecordBatch { - return new RecordBatch( - this.schema, - others.reduce((numRows, batch) => numRows + batch.numRows, this.numRows), - others.reduce((columns, batch) => - columns.map((col, idx) => col.concat(batch.columns[idx])), - this.columns - ) +export class RecordBatch extends StructVector { + public static from(vectors: Vector[]) { + return new RecordBatch(Schema.from(vectors), + Math.max(...vectors.map((v) => v.length)), + vectors ); } -} - -export class Table { - - public static from = syncTableFromInputs; - public static fromAsync = asyncTableFromInputs; - public static empty() { return new Table(new Schema([]), []); } - - protected _view: View; public readonly schema: Schema; + public readonly numCols: number; + public readonly numRows: number; public readonly columns: Vector[]; - - constructor(schema: Schema, columns: Vector[]) { - this.schema = schema; - this.columns = columns; - this._view = new StructView( - new NestedData( - new Struct(schema.fields), - this.numRows, new Uint8Array(0), - columns.map((col) => col.data) - ), - columns - ); + constructor(schema: Schema, data: Data, view: View); + constructor(schema: Schema, numRows: Long | number, cols: Data | Vector[]); + constructor(...args: any[]) { + if (typeof args[1] !== 'number') { + const data = args[1] as Data; + super(data, args[2]); + this.schema = args[0]; + this.numRows = data.length; + this.numCols = this.schema.fields.length; + this.columns = data instanceof ChunkedData + ? data.childVectors + : data.childData.map((col) => Vector.create(col)); + } else { + const [schema, numRows, cols] = args; + const columns: Vector[] = new Array(cols.length); + const columnsData: Data[] = new Array(cols.length); + for (let index = -1, length = cols.length; ++index < length;) { + const col: Data | Vector = cols[index]; + if (col instanceof Vector) { + columnsData[index] = (columns[index] = col as Vector).data; + } else { + columns[index] = Vector.create(columnsData[index] = col); + } + } + super(new NestedData(new Struct(schema.fields), numRows, null, columnsData)); + this.schema = schema; + this.columns = columns; + this.numRows = numRows; + this.numCols = schema.fields.length; + } } - public get numCols() { return this.columns.length; } - public get numRows() { return this.columns[0].length; } - public get(index: number) { return this._view.get(index); } - public toArray(): IterableArrayLike { return this._view.toArray(); } - public [Symbol.iterator](): IterableIterator { - return this._view[Symbol.iterator](); + public clone(data: Data, view: View = this.view.clone(data)): this { + return new RecordBatch(this.schema, data, view) as this; } public select(...columnNames: string[]) { const fields = this.schema.fields; const namesToKeep = columnNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); - return new Table( - this.schema.select(...columnNames), + return new RecordBatch( + this.schema.select(...columnNames), this.numRows, this.columns.filter((_, index) => namesToKeep[fields[index].name]) ); } - public rowsToString(separator = ' | '): TableToStringIterator { - return new TableToStringIterator(tableRowsToString(this, separator)); - } -} - -export function syncTableFromInputs(sources?: Iterable | object | string) { - let schema: Schema | undefined, columns: Vector[] = []; - if (sources) { - for (let recordBatch of read(sources)) { - schema = schema || recordBatch.schema; - columns = concatVectors(columns, recordBatch.columns); - } - return new Table(schema!, columns); - } - return Table.empty(); -} - -export async function* asyncTableFromInputs(sources?: Iterable | AsyncIterable | Promise | object | string) { - let columns: Vector[] = []; - let schema: Schema | undefined; - if (isAsyncIterable(sources)) { - for await (let recordBatch of readAsync(sources)) { - schema = schema || recordBatch.schema; - columns = concatVectors(columns, recordBatch.columns); - } - return new Table(schema!, columns); - } else if (isPromise(sources)) { - return Table.from(await sources); - } else if (sources) { - return Table.from(sources); - } - return Table.empty(); -} - -export class TableToStringIterator implements IterableIterator { - constructor(private iterator: IterableIterator) {} - [Symbol.iterator]() { return this.iterator; } - next(value?: any) { return this.iterator.next(value); } - throw(error?: any) { return this.iterator.throw && this.iterator.throw(error) || { done: true, value: '' }; } - return(value?: any) { return this.iterator.return && this.iterator.return(value) || { done: true, value: '' }; } - pipe(stream: NodeJS.WritableStream) { - let res: IteratorResult; - let write = () => { - if (stream.writable) { - do { - if ((res = this.next()).done) { break; } - } while (stream.write(res.value + '\n', 'utf8')); - } - if (!res || !res.done) { - stream.once('drain', write); - } else if (!(stream as any).isTTY) { - stream.end('\n'); - } - }; - write(); - } -} - -function *tableRowsToString(table: Table, separator = ' | ') { - const fields = table.schema.fields; - const header = ['row_id', ...fields.map((f) => `${f}`)].map(stringify); - const maxColumnWidths = header.map(x => x.length); - // Pass one to convert to strings and count max column widths - for (let i = -1, n = table.numRows - 1; ++i < n;) { - let val, row = [i, ...table.get(i)]; - for (let j = -1, k = row.length; ++j < k; ) { - val = stringify(row[j]); - maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); - } - } - yield header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(separator); - for (let i = -1, n = table.numRows; ++i < n;) { - yield [i, ...table.get(i)] - .map((x) => stringify(x)) - .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) - .join(separator); - } -} - -function concatVectors(tableVectors: Vector[], batchVectors: Vector[]) { - return tableVectors.length === 0 ? batchVectors : batchVectors.map((vec, i, _vs, col = tableVectors[i]) => - vec && col && col.concat(vec) || col || vec - ) as Vector[]; -} - -function leftPad(str: string, fill: string, n: number) { - return (new Array(n + 1).join(fill) + str).slice(-1 * n); -} - -function stringify(x: any) { - return typeof x === 'string' ? `"${x}"` : ArrayBuffer.isView(x) ? `[${x}]` : JSON.stringify(x); } diff --git a/js/src/table.ts b/js/src/table.ts new file mode 100644 index 0000000000000..4bf397631fb84 --- /dev/null +++ b/js/src/table.ts @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { RecordBatch } from './recordbatch'; +import { Col, Predicate } from './predicate'; +import { Schema, Field, Struct } from './type'; +import { read, readAsync } from './ipc/reader/arrow'; +import { isPromise, isAsyncIterable } from './util/compat'; +import { Vector, DictionaryVector, IntVector } from './vector'; + +export type NextFunc = (idx: number, cols: RecordBatch) => void; + +export interface DataFrame { + filter(predicate: Predicate): DataFrame; + scan(next: NextFunc): void; + count(): number; + countBy(col: (Col|string)): CountByResult; +} + +export class Table implements DataFrame { + static empty() { return new Table(new Schema([]), []); } + static from(sources?: Iterable | object | string) { + if (sources) { + let schema: Schema | undefined; + let recordBatches: RecordBatch[] = []; + for (let recordBatch of read(sources)) { + schema = schema || recordBatch.schema; + recordBatches.push(recordBatch); + } + return new Table(schema || new Schema([]), recordBatches); + } + return Table.empty(); + } + static async fromAsync(sources?: AsyncIterable) { + if (isAsyncIterable(sources)) { + let schema: Schema | undefined; + let recordBatches: RecordBatch[] = []; + for await (let recordBatch of readAsync(sources)) { + schema = schema || recordBatch.schema; + recordBatches.push(recordBatch); + } + return new Table(schema || new Schema([]), recordBatches); + } else if (isPromise(sources)) { + return Table.from(await sources); + } else if (sources) { + return Table.from(sources); + } + return Table.empty(); + } + + public readonly schema: Schema; + public readonly numCols: number; + public readonly numRows: number; + // List of inner RecordBatches + public readonly batches: RecordBatch[]; + // List of inner Vectors, possibly spanning batches + public readonly columns: Vector[]; + // Union of all inner RecordBatches into one RecordBatch, possibly chunked. + // If the Table has just one inner RecordBatch, this points to that. + // If the Table has multiple inner RecordBatches, then this is a Chunked view + // over the list of RecordBatches. This allows us to delegate the responsibility + // of indexing, iterating, slicing, and visiting to the Nested/Chunked Data/Views. + public readonly batchesUnion: RecordBatch; + + constructor(batches: RecordBatch[]); + constructor(...batches: RecordBatch[]); + constructor(schema: Schema, batches: RecordBatch[]); + constructor(schema: Schema, ...batches: RecordBatch[]); + constructor(...args: any[]) { + let schema: Schema; + let batches: RecordBatch[]; + if (args[0] instanceof Schema) { + schema = args[0]; + batches = Array.isArray(args[1][0]) ? args[1][0] : args[1]; + } else if (args[0] instanceof RecordBatch) { + schema = (batches = args)[0].schema; + } else { + schema = (batches = args[0])[0].schema; + } + this.schema = schema; + this.batches = batches; + this.batchesUnion = batches.reduce((union, batch) => union.concat(batch)); + this.columns = batches.slice(1).reduce((columns, batch) => + columns.map((col, idx) => col.concat(batch.columns[idx])), + batches[0].columns + ); + this.numCols = this.batchesUnion.numCols; + this.numRows = this.batchesUnion.numRows; + } + public get(index: number): Struct['TValue'] { + return this.batchesUnion.get(index)!; + } + public [Symbol.iterator](): IterableIterator { + return this.batchesUnion[Symbol.iterator]() as any; + } + public filter(predicate: Predicate): DataFrame { + return new FilteredDataFrame(this.batches, predicate); + } + public scan(next: NextFunc) { + const batches = this.batches, numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + // yield all indices + for (let index = -1, numRows = batch.numRows; ++index < numRows;) { + next(index, batch); + } + } + } + public count(): number { return this.batchesUnion.length; } + public countBy(name: Col | string): CountByResult { + const batches = this.batches, numBatches = batches.length; + const count_by = typeof name === 'string' ? new Col(name) : name; + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as DictionaryVector; + if (!(vector instanceof DictionaryVector)) { + throw new Error('countBy currently only supports dictionary-encoded columns'); + } + // TODO: Adjust array byte width based on overall length + // (e.g. if this.length <= 255 use Uint8Array, etc...) + const counts: Uint32Array = new Uint32Array(vector.dictionary.length); + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as DictionaryVector).indicies; + // yield all indices + for (let index = -1, numRows = batch.numRows; ++index < numRows;) { + let key = keys.get(index); + if (key !== null) { counts[key]++; } + } + } + return new CountByResult(vector.dictionary, IntVector.from(counts)); + } + public select(...columnNames: string[]) { + return new Table(this.batches.map((batch) => batch.select(...columnNames))); + } + public rowsToString(separator = ' | '): TableToStringIterator { + return new TableToStringIterator(tableRowsToString(this, separator)); + } +} + +class FilteredDataFrame implements DataFrame { + private predicate: Predicate; + private batches: RecordBatch[]; + constructor (batches: RecordBatch[], predicate: Predicate) { + this.batches = batches; + this.predicate = predicate; + } + public scan(next: NextFunc) { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + const batches = this.batches; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this.predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.numRows; ++index < numRows;) { + if (predicate(index, batch)) { next(index, batch); } + } + } + } + public count(): number { + // inlined version of this: + // let sum = 0; + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) ++sum; + // }); + // return sum; + let sum = 0; + const batches = this.batches; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this.predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.numRows; ++index < numRows;) { + if (predicate(index, batch)) { ++sum; } + } + } + return sum; + } + public filter(predicate: Predicate): DataFrame { + return new FilteredDataFrame( + this.batches, + this.predicate.and(predicate) + ); + } + public countBy(name: Col | string): CountByResult { + const batches = this.batches, numBatches = batches.length; + const count_by = typeof name === 'string' ? new Col(name) : name; + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as DictionaryVector; + if (!(vector instanceof DictionaryVector)) { + throw new Error('countBy currently only supports dictionary-encoded columns'); + } + // TODO: Adjust array byte width based on overall length + // (e.g. if this.length <= 255 use Uint8Array, etc...) + const counts: Uint32Array = new Uint32Array(vector.dictionary.length); + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this.predicate.bind(batch); + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as DictionaryVector).indicies; + // yield all indices + for (let index = -1, numRows = batch.numRows; ++index < numRows;) { + let key = keys.get(index); + if (key !== null && predicate(index, batch)) { counts[key]++; } + } + } + return new CountByResult(vector.dictionary, IntVector.from(counts)); + } +} + +export class CountByResult extends Table implements DataFrame { + constructor(values: Vector, counts: IntVector) { + super( + new RecordBatch(new Schema([ + new Field('values', values.type), + new Field('counts', counts.type) + ]), + counts.length, [values, counts] + )); + } + public asJSON(): Object { + const [values, counts] = this.columns; + const result = {} as { [k: string]: number | null }; + for (let i = -1; ++i < this.numRows;) { + result[values.get(i)] = counts.get(i); + } + return result; + } +} + +export class TableToStringIterator implements IterableIterator { + constructor(private iterator: IterableIterator) {} + [Symbol.iterator]() { return this.iterator; } + next(value?: any) { return this.iterator.next(value); } + throw(error?: any) { return this.iterator.throw && this.iterator.throw(error) || { done: true, value: '' }; } + return(value?: any) { return this.iterator.return && this.iterator.return(value) || { done: true, value: '' }; } + pipe(stream: NodeJS.WritableStream) { + let res: IteratorResult; + let write = () => { + if (stream.writable) { + do { + if ((res = this.next()).done) { break; } + } while (stream.write(res.value + '\n', 'utf8')); + } + if (!res || !res.done) { + stream.once('drain', write); + } else if (!(stream as any).isTTY) { + stream.end('\n'); + } + }; + write(); + } +} + +function *tableRowsToString(table: Table, separator = ' | ') { + const fields = table.schema.fields; + const header = ['row_id', ...fields.map((f) => `${f}`)].map(stringify); + const maxColumnWidths = header.map(x => x.length); + // Pass one to convert to strings and count max column widths + for (let i = -1, n = table.numRows - 1; ++i < n;) { + let val, row = [i, ...table.get(i)]; + for (let j = -1, k = row.length; ++j < k; ) { + val = stringify(row[j]); + maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); + } + } + yield header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(separator); + for (let i = -1, n = table.numRows; ++i < n;) { + yield [i, ...table.get(i)] + .map((x) => stringify(x)) + .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) + .join(separator); + } +} + +function leftPad(str: string, fill: string, n: number) { + return (new Array(n + 1).join(fill) + str).slice(-1 * n); +} + +function stringify(x: any) { + return typeof x === 'string' ? `"${x}"` : ArrayBuffer.isView(x) ? `[${x}]` : JSON.stringify(x); +} diff --git a/js/src/type.ts b/js/src/type.ts index 06293bcc50238..95d71ca3914b7 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -34,6 +34,9 @@ export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; export class Schema { + public static from(vectors: Vector[]) { + return new Schema(vectors.map((v, i) => new Field('' + i, v.type))); + } // @ts-ignore protected _bodyLength: number; // @ts-ignore @@ -208,92 +211,50 @@ export class Null extends DataType { } export interface Int extends DataType { TArray: TArrayType; TValue: TValueType; } -export abstract class Int extends DataType { +export class Int extends DataType { constructor(public readonly isSigned: boolean, public readonly bitWidth: IntBitWidth) { super(Type.Int); } - // @ts-ignore - public readonly ArrayType: TypedArrayConstructor; + public get ArrayType(): TypedArrayConstructor { + switch (this.bitWidth) { + case 8: return (this.isSigned ? Int8Array : Uint8Array) as any; + case 16: return (this.isSigned ? Int16Array : Uint16Array) as any; + case 32: return (this.isSigned ? Int32Array : Uint32Array) as any; + case 64: return (this.isSigned ? Int32Array : Uint32Array) as any; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } public toString() { return `${this.isSigned ? `I` : `Ui`}nt${this.bitWidth}`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitInt(this); } protected static [Symbol.toStringTag] = ((proto: Int) => { - ( proto).ArrayType = Uint8Array; return proto[Symbol.toStringTag] = 'Int'; })(Int.prototype); } -export class Int8 extends Int { - constructor() { super(true, 8); } - protected static [Symbol.toStringTag] = ((proto: Int8) => { - ( proto).ArrayType = Int8Array; - return proto[Symbol.toStringTag] = 'Int8'; - })(Int8.prototype); -} - -export class Int16 extends Int { - constructor() { super(true, 16); } - protected static [Symbol.toStringTag] = ((proto: Int16) => { - ( proto).ArrayType = Int16Array; - return proto[Symbol.toStringTag] = 'Int16'; - })(Int16.prototype); -} - -export class Int32 extends Int { - constructor() { super(true, 32); } - protected static [Symbol.toStringTag] = ((proto: Int32) => { - ( proto).ArrayType = Int32Array; - return proto[Symbol.toStringTag] = 'Int32'; - })(Int32.prototype); -} - -export class Int64 extends Int { - constructor() { super(true, 64); } - protected static [Symbol.toStringTag] = ((proto: Int64) => { - ( proto).ArrayType = Int32Array; - return proto[Symbol.toStringTag] = 'Int64'; - })(Int64.prototype); -} - -export class Uint8 extends Int { - constructor() { super(false, 8); } - protected static [Symbol.toStringTag] = ((proto: Uint8) => { - ( proto).ArrayType = Uint8Array; - return proto[Symbol.toStringTag] = 'Uint8'; - })(Uint8.prototype); -} - -export class Uint16 extends Int { - constructor() { super(false, 16); } - protected static [Symbol.toStringTag] = ((proto: Uint16) => { - ( proto).ArrayType = Uint16Array; - return proto[Symbol.toStringTag] = 'Uint16'; - })(Uint16.prototype); -} - -export class Uint32 extends Int { - constructor() { super(false, 32); } - protected static [Symbol.toStringTag] = ((proto: Uint32) => { - ( proto).ArrayType = Uint32Array; - return proto[Symbol.toStringTag] = 'Uint32'; - })(Uint32.prototype); -} - -export class Uint64 extends Int { - constructor() { super(false, 64); } - protected static [Symbol.toStringTag] = ((proto: Uint64) => { - ( proto).ArrayType = Uint32Array; - return proto[Symbol.toStringTag] = 'Uint64'; - })(Uint64.prototype); -} +export class Int8 extends Int { constructor() { super(true, 8); } } +export class Int16 extends Int { constructor() { super(true, 16); } } +export class Int32 extends Int { constructor() { super(true, 32); } } +export class Int64 extends Int { constructor() { super(true, 64); } } +export class Uint8 extends Int { constructor() { super(false, 8); } } +export class Uint16 extends Int { constructor() { super(false, 16); } } +export class Uint32 extends Int { constructor() { super(false, 32); } } +export class Uint64 extends Int { constructor() { super(false, 64); } } export interface Float extends DataType { TArray: TArrayType; TValue: number; } -export abstract class Float extends DataType { +export class Float extends DataType { constructor(public readonly precision: Precision) { super(Type.Float); } // @ts-ignore - public readonly ArrayType: TypedArrayConstructor; + public get ArrayType(): TypedArrayConstructor { + switch (this.precision) { + case Precision.HALF: return Uint16Array as any; + case Precision.SINGLE: return Float32Array as any; + case Precision.DOUBLE: return Float64Array as any; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } public toString() { return `Float${(this.precision << 5) || 16}`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFloat(this); } protected static [Symbol.toStringTag] = ((proto: Float) => { @@ -301,32 +262,9 @@ export abstract class Float extends })(Float.prototype); } -export interface Float16 extends Float {} -export class Float16 extends Float { - constructor() { super(Precision.HALF); } - protected static [Symbol.toStringTag] = ((proto: Float16) => { - ( proto).ArrayType = Uint16Array; - return proto[Symbol.toStringTag] = 'Float16'; - })(Float16.prototype); -} - -export interface Float32 extends Float {} -export class Float32 extends Float { - constructor() { super(Precision.SINGLE); } - protected static [Symbol.toStringTag] = ((proto: Float32) => { - ( proto).ArrayType = Float32Array; - return proto[Symbol.toStringTag] = 'Float32'; - })(Float32.prototype); -} - -export interface Float64 extends Float {} -export class Float64 extends Float { - constructor() { super(Precision.DOUBLE); } - protected static [Symbol.toStringTag] = ((proto: Float64) => { - ( proto).ArrayType = Float64Array; - return proto[Symbol.toStringTag] = 'Float64'; - })(Float64.prototype); -} +export class Float16 extends Float { constructor() { super(Precision.HALF); } } +export class Float32 extends Float { constructor() { super(Precision.SINGLE); } } +export class Float64 extends Float { constructor() { super(Precision.DOUBLE); } } export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } export class Binary extends DataType { diff --git a/js/src/vector.ts b/js/src/vector.ts index 59ff4da5a786a..f4f8060396547 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -import { Data, ChunkedData } from './data'; +import { Data, ChunkedData, FlatData, BoolData } from './data'; import { VisitorNode, TypeVisitor, VectorVisitor } from './visitor'; import { DataType, ListType, FlatType, NestedType, FlatListType } from './type'; import { IterableArrayLike, Precision, DateUnit, IntervalUnit, UnionMode } from './type'; @@ -35,73 +35,70 @@ export class Vector implements VectorLike, View, Vi public static create(data: Data): Vector { return createVector(data); } - // @ts-ignore - protected _data: Data; - // @ts-ignore - protected _view: View; + public readonly data: Data; + public readonly view: View; constructor(data: Data, view: View) { - this._data = data; + this.data = data; let nulls: Uint8Array; if (( data instanceof ChunkedData) && !(view instanceof ChunkedView)) { - this._view = new ChunkedView(data); + this.view = new ChunkedView(data); } else if (!(view instanceof ValidityView) && (nulls = data.nullBitmap!) && nulls.length > 0 && data.nullCount > 0) { - this._view = new ValidityView(data, view); + this.view = new ValidityView(data, view); } else { - this._view = view; + this.view = view; } } - public get data() { return this._data; } - public get type() { return this._data.type; } - public get length() { return this._data.length; } - public get nullCount() { return this._data.nullCount; } - public get nullBitmap() { return this._data.nullBitmap; } + public get type() { return this.data.type; } + public get length() { return this.data.length; } + public get nullCount() { return this.data.nullCount; } + public get nullBitmap() { return this.data.nullBitmap; } public get [Symbol.toStringTag]() { return `Vector<${this.type[Symbol.toStringTag]}>`; } - public toJSON() { return this.toArray(); } - public clone(data: Data): this { - return this._view.clone(this._data = data) && this || this; + public toJSON(): any { return this.toArray(); } + public clone(data: Data, view: View = this.view.clone(data)): this { + return new (this.constructor as any)(data, view); } public isValid(index: number): boolean { - return this._view.isValid(index); + return this.view.isValid(index); } public get(index: number): T['TValue'] | null { - return this._view.get(index); + return this.view.get(index); } public set(index: number, value: T['TValue']): void { - return this._view.set(index, value); + return this.view.set(index, value); } public toArray(): IterableArrayLike { - return this._view.toArray(); + return this.view.toArray(); } public [Symbol.iterator](): IterableIterator { - return this._view[Symbol.iterator](); + return this.view[Symbol.iterator](); } public concat(...others: Vector[]): this { if ((others = others.filter(Boolean)).length === 0) { return this; } - const { _view: view } = this; + const { view } = this; const vecs = !(view instanceof ChunkedView) ? [this, ...others] : [...view.chunks, ...others]; const offsets = ChunkedData.computeOffsets(vecs); const chunksLength = offsets[offsets.length - 1]; const chunkedData = new ChunkedData(this.type, chunksLength, vecs, 0, -1, offsets); - return new (this.constructor as any)(chunkedData, new ChunkedView(chunkedData)) as this; + return this.clone(chunkedData, new ChunkedView(chunkedData)) as this; } public slice(begin?: number, end?: number): this { let { length } = this; - let size = (this._view as any).size || 1; + let size = (this.view as any).size || 1; let total = length, from = (begin || 0) * size; let to = (typeof end === 'number' ? end : total) * size; if (to < 0) { to = total - (to * -1) % total; } if (from < 0) { from = total - (from * -1) % total; } if (to < from) { [from, to] = [to, from]; } total = !isFinite(total = (to - from)) || total < 0 ? 0 : total; - const newData = this._data.slice(from, Math.min(total, length)); - return new (this.constructor as any)(newData, this._view.clone(newData)) as this; + const slicedData = this.data.slice(from, Math.min(total, length)); + return this.clone(slicedData, this.view.clone(slicedData)) as this; } public acceptTypeVisitor(visitor: TypeVisitor): any { @@ -113,12 +110,12 @@ export class Vector implements VectorLike, View, Vi } export abstract class FlatVector extends Vector { - public get values() { return this._data.values; } + public get values() { return this.data.values; } } export abstract class ListVectorBase extends Vector { - public get values() { return this._data.values; } - public get valueOffsets() { return this._data.valueOffsets; } + public get values() { return this.data.values; } + public get valueOffsets() { return this.data.valueOffsets; } public getValueOffset(index: number) { return this.valueOffsets[index]; } @@ -129,17 +126,18 @@ export abstract class ListVectorBase extend export abstract class NestedVector extends Vector { // @ts-ignore - protected _view: NestedView; + public readonly view: NestedView; public get childData(): Data[] { return this.data.childData; } public getChildAt(index: number) { - return this._view.getChildAt(index); + return this.view.getChildAt(index); } } import { List, Binary, Utf8, Bool, } from './type'; -import { Null, Int, Float, Float16, Decimal, Date_, Time, Timestamp, Interval } from './type'; +import { Null, Int, Float, Decimal, Date_, Time, Timestamp, Interval } from './type'; +import { Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Float16, Float32, Float64 } from './type'; import { Struct, Union, SparseUnion, DenseUnion, FixedSizeBinary, FixedSizeList, Map_, Dictionary } from './type'; import { ChunkedView } from './vector/chunked'; @@ -147,6 +145,7 @@ import { DictionaryView } from './vector/dictionary'; import { ListView, FixedSizeListView, BinaryView, Utf8View } from './vector/list'; import { UnionView, DenseUnionView, NestedView, StructView, MapView } from './vector/nested'; import { FlatView, NullView, BoolView, ValidityView, FixedSizeView, Float16View, DateDayView, DateMillisecondView, IntervalYearMonthView } from './vector/flat'; +import { packBools } from './util/bit'; export class NullVector extends Vector { constructor(data: Data, view: View = new NullView(data)) { @@ -155,12 +154,40 @@ export class NullVector extends Vector { } export class BoolVector extends Vector { + public static from(data: IterableArrayLike) { + return new BoolVector(new BoolData(new Bool(), data.length, null, packBools(data))); + } + public get values() { return this.data.values; } constructor(data: Data, view: View = new BoolView(data)) { super(data, view); } } export class IntVector> extends FlatVector { + public static from(data: Int8Array): IntVector; + public static from(data: Int16Array): IntVector; + public static from(data: Int32Array): IntVector; + public static from(data: Uint8Array): IntVector; + public static from(data: Uint16Array): IntVector; + public static from(data: Uint32Array): IntVector; + public static from(data: Int32Array, is64: true): IntVector; + public static from(data: Uint32Array, is64: true): IntVector; + public static from(data: any, is64?: boolean) { + if (is64 === true) { + return data instanceof Int32Array + ? new IntVector(new FlatData(new Int64(), data.length, null, data)) + : new IntVector(new FlatData(new Uint64(), data.length, null, data)); + } + switch (data.constructor) { + case Int8Array: return new IntVector(new FlatData(new Int8(), data.length, null, data)); + case Int16Array: return new IntVector(new FlatData(new Int16(), data.length, null, data)); + case Int32Array: return new IntVector(new FlatData(new Int32(), data.length, null, data)); + case Uint8Array: return new IntVector(new FlatData(new Uint8(), data.length, null, data)); + case Uint16Array: return new IntVector(new FlatData(new Uint16(), data.length, null, data)); + case Uint32Array: return new IntVector(new FlatData(new Uint32(), data.length, null, data)); + } + throw new TypeError('Unrecognized Int data'); + } static defaultView(data: Data) { return data.type.bitWidth <= 32 ? new FlatView(data) : new FixedSizeView(data, (data.type.bitWidth / 32) | 0); } @@ -170,6 +197,17 @@ export class IntVector> extends FlatVector { } export class FloatVector> extends FlatVector { + public static from(data: Uint16Array): FloatVector; + public static from(data: Float32Array): FloatVector; + public static from(data: Float64Array): FloatVector; + public static from(data: any) { + switch (data.constructor) { + case Uint16Array: return new FloatVector(new FlatData(new Float16(), data.length, null, data)); + case Float32Array: return new FloatVector(new FlatData(new Float32(), data.length, null, data)); + case Float64Array: return new FloatVector(new FlatData(new Float64(), data.length, null, data)); + } + throw new TypeError('Unrecognized Float data'); + } static defaultView(data: Data): FlatView { return data.type.precision !== Precision.HALF ? new FlatView(data) : new Float16View(data as Data); } @@ -266,8 +304,10 @@ export class UnionVector extends Nes } export class DictionaryVector extends Vector> { - public readonly indicies?: Vector; - public readonly dictionary?: Vector; + // @ts-ignore + public readonly indicies: Vector; + // @ts-ignore + public readonly dictionary: Vector; constructor(data: Data>, view: View> = new DictionaryView(data.dictionary, new IntVector(data.indicies))) { super(data as Data, view); if (view instanceof DictionaryView) { @@ -282,8 +322,8 @@ export class DictionaryVector extends Vector(data: Data) => TypeVisitor) => ( diff --git a/js/src/vector/chunked.ts b/js/src/vector/chunked.ts index 32651f0356a6d..24272d2635248 100644 --- a/js/src/vector/chunked.ts +++ b/js/src/vector/chunked.ts @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -import { DataType, TypedArray } from '../type'; -import { View, Vector } from '../vector'; -import { IterableArrayLike } from '../type'; import { ChunkedData } from '../data'; +import { View, Vector } from '../vector'; +import { DataType, TypedArray, IterableArrayLike } from '../type'; export class ChunkedView implements View { public chunks: Vector[]; diff --git a/js/src/vector/nested.ts b/js/src/vector/nested.ts index 8c990fae9d76d..d30e4203f99dd 100644 --- a/js/src/vector/nested.ts +++ b/js/src/vector/nested.ts @@ -44,7 +44,7 @@ export abstract class NestedView implements View { public toArray(): IterableArrayLike { return [...this]; } - public toJSON() { return this.toArray(); } + public toJSON(): any { return this.toArray(); } public toString() { return [...this].map((x) => stringify(x)).join(', '); } @@ -104,7 +104,6 @@ export class UnionView exten } export class DenseUnionView extends UnionView { - // @ts-ignore public valueOffsets: Int32Array; constructor(data: Data, children?: Vector[]) { super(data, children); @@ -128,14 +127,16 @@ export class StructView extends NestedView { return new RowView(self as any, self.children, index); } protected setNested(self: StructView, index: number, value: any): void { - for (let idx = -1, len = self.numChildren; ++idx < len;) { - self.getChildAt(index).set(index, value[idx]); + let idx = -1, len = self.numChildren; + if (!(value instanceof NestedView || value instanceof Vector)) { + while (++idx < len) { self.getChildAt(idx).set(index, value[idx]); } + } else { + while (++idx < len) { self.getChildAt(idx).set(index, value.get(idx)); } } } } export class MapView extends NestedView { - // @ts-ignore public typeIds: { [k: string]: number }; constructor(data: Data, children?: Vector[]) { super(data, children); @@ -146,8 +147,11 @@ export class MapView extends NestedView { return new MapRowView(self as any, self.children, index); } protected setNested(self: MapView, index: number, value: { [k: string]: any }): void { - for (const [key, idx] of Object.entries(self.typeIds)) { - self.getChildAt(idx).set(index, value[key]); + const typeIds = self.typeIds as any; + if (!(value instanceof NestedView || value instanceof Vector)) { + for (const key in typeIds) { self.getChildAt(typeIds[key]).set(index, value[key]); } + } else { + for (const key in typeIds) { self.getChildAt(typeIds[key]).set(index, value.get(key as any)); } } } } @@ -166,19 +170,30 @@ export class RowView extends UnionView { const child = self.getChildAt(index); return child ? child.get(self.rowIndex) : null; } - protected setChildValue(self: RowView, index: number, value: any, _typeIds: Int8Array, _valueOffsets?: any): any | null { + protected setChildValue(self: RowView, index: number, value: any, _typeIds: any, _valueOffsets?: any): any | null { const child = self.getChildAt(index); return child ? child.set(self.rowIndex, value) : null; } } export class MapRowView extends RowView { - protected getChildValue(self: MapRowView, index: number, typeIds: any, _valueOffsets: any): any | null { - const child = self.getChildAt(typeIds[index]); + // @ts-ignore + public typeIds: any; + public toJSON() { + const get = this.getChildValue; + const result = {} as { [k: string]: any }; + const typeIds = this.typeIds as { [k: string]: number }; + for (const name in typeIds) { + result[name] = get(this, name, typeIds, null); + } + return result; + } + protected getChildValue(self: MapRowView, key: any, typeIds: any, _valueOffsets: any): any | null { + const child = self.getChildAt(typeIds[key]); return child ? child.get(self.rowIndex) : null; } - protected setChildValue(self: MapRowView, index: number, value: any, typeIds: Int8Array, _valueOffsets?: any): any | null { - const child = self.getChildAt(typeIds[index]); + protected setChildValue(self: MapRowView, key: any, value: any, typeIds: any, _valueOffsets?: any): any | null { + const child = self.getChildAt(typeIds[key]); return child ? child.set(self.rowIndex, value) : null; } } diff --git a/js/test/data/tables/generate.py b/js/test/data/tables/generate.py new file mode 100644 index 0000000000000..da19c6a0728c0 --- /dev/null +++ b/js/test/data/tables/generate.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import random +import numpy as np +import pandas as pd + + +cities = [u'Charlottesville', u'New York', u'San Francisco', u'Seattle', u'Terre Haute', u'Washington, DC'] + +def generate_batch(batch_len): + return pa.RecordBatch.from_arrays([ + pa.Array.from_pandas(pd.Series(np.random.uniform(-90,90,batch_len), dtype="float32")), + pa.Array.from_pandas(pd.Series(np.random.uniform(-180,180,batch_len), dtype="float32")), + pa.Array.from_pandas(pd.Categorical((random.choice(cities) for i in range(batch_len)), cities)), + pa.Array.from_pandas(pd.Categorical((random.choice(cities) for i in range(batch_len)), cities)) + ], ['lat', 'lng', 'origin', 'destination']) + +def write_record_batches(fd, batch_len, num_batches): + writer = pa.ipc.RecordBatchStreamWriter(fd, generate_batch(1).schema) + for batch in range(num_batches): + writer.write_batch(generate_batch(batch_len)) + + writer.close() + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='number of batches') + parser.add_argument('-n', '--num-batches', help='number of batches', type=int, default=10) + parser.add_argument('-b', '--batch-size', help='size of each batch', type=int, default=100000) + + args = parser.parse_args() + + print "Writing {} {}-element batches to '{}'".format(args.num_batches, args.batch_size, args.filename) + with open(args.filename, 'w') as fd: + write_record_batches(fd, args.batch_size, args.num_batches) diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts index 16a36f827a4e6..c9ce6ba8d7274 100644 --- a/js/test/integration/validate-tests.ts +++ b/js/test/integration/validate-tests.ts @@ -104,6 +104,7 @@ describe(`Integration`, () => { function testReaderIntegration() { test(`json and arrow buffers report the same values`, () => { + debugger; expect.hasAssertions(); const jsonRecordBatches = toArray(read(jsonData)); const binaryRecordBatches = toArray(read(arrowBuffers)); @@ -119,6 +120,7 @@ function testReaderIntegration() { function testTableFromBuffersIntegration() { test(`json and arrow buffers report the same values`, () => { + debugger; expect.hasAssertions(); const jsonTable = Table.from(jsonData); const binaryTable = Table.from(arrowBuffers); diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts new file mode 100644 index 0000000000000..f5c5a6ea6e0c3 --- /dev/null +++ b/js/test/unit/table-tests.ts @@ -0,0 +1,391 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import Arrow, { +} from '../Arrow'; + +const { + col, + Table, +} = Arrow; + +describe(`Table`, () => { + describe(`single record batch`, () => { + const table = Table.from({ + 'schema': { + 'fields': [ + { + 'name': 'f32', + 'type': { + 'name': 'floatingpoint', + 'precision': 'SINGLE' + }, + 'nullable': false, + 'children': [], + }, + { + 'name': 'i32', + 'type': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 32 + }, + 'nullable': false, + 'children': [], + }, + { + 'name': 'dictionary', + 'type': { + 'name': 'utf8' + }, + 'nullable': false, + 'children': [], + 'dictionary': { + 'id': 0, + 'indexType': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 8 + }, + 'isOrdered': false + } + } + ] + }, + 'dictionaries': [{ + 'id': 0, + 'data': { + 'count': 3, + 'columns': [ + { + 'name': 'DICT0', + 'count': 3, + 'VALIDITY': [], + 'OFFSET': [ + 0, + 1, + 2, + 3 + ], + 'DATA': [ + 'a', + 'b', + 'c', + ] + } + ] + } + }], + 'batches': [{ + 'count': 7, + 'columns': [ + { + 'name': 'f32', + 'count': 7, + 'VALIDITY': [], + 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] + }, + { + 'name': 'i32', + 'count': 7, + 'VALIDITY': [], + 'DATA': [-1, 1, -1, 1, -1, 1, -1] + }, + { + 'name': 'dictionary', + 'count': 7, + 'VALIDITY': [], + 'DATA': [0, 1, 2, 0, 1, 2, 0] + } + ] + }] + }); + + // Wrap floating point values in a Float32Array and take them back out to + // make sure that equality checks will pass + const values = [ + [new Float32Array([-0.3])[0], -1, 'a'], + [new Float32Array([-0.2])[0], 1, 'b'], + [new Float32Array([-0.1])[0], -1, 'c'], + [new Float32Array([ 0 ])[0], 1, 'a'], + [new Float32Array([ 0.1])[0], -1, 'b'], + [new Float32Array([ 0.2])[0], 1, 'c'], + [new Float32Array([ 0.3])[0], -1, 'a'] + ]; + test(`has the correct length`, () => { + expect(table.numRows).toEqual(values.length); + }); + test(`gets expected values`, () => { + for (let i = -1; ++i < values.length;) { + expect(table.get(i).toArray()).toEqual(values[i]); + } + }); + test(`iterates expected values`, () => { + let i = 0; + for (let row of table) { + expect(row.toArray()).toEqual(values[i++]); + } + }); + test(`scans expected values`, () => { + let expected_idx = 0; + table.scan((idx, batch) => { + expect(batch.columns.map((c) => c.get(idx))).toEqual(values[expected_idx++]); + }); + }); + test(`count() returns the correct length`, () => { + expect(table.count()).toEqual(values.length); + }); + test(`filter on f32 >= 0 returns the correct length`, () => { + expect(table.filter(col('f32').gteq(0)).count()).toEqual(4); + }); + test(`filter on i32 <= 0 returns the correct length`, () => { + expect(table.filter(col('i32').lteq(0)).count()).toEqual(4); + }); + test(`filter on dictionary == 'a' returns the correct length`, () => { + expect(table.filter(col('dictionary').eq('a')).count()).toEqual(3); + }); + test(`countBy on dictionary returns the correct counts`, () => { + // Make sure countBy works both with and without the Col wrapper + // class + expect(table.countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 3, + 'b': 2, + 'c': 2, + }); + expect(table.countBy('dictionary').asJSON()).toEqual({ + 'a': 3, + 'b': 2, + 'c': 2, + }); + }); + test(`countBy on dictionary with filter returns the correct counts`, () => { + expect(table.filter(col('i32').eq(1)).countBy('dictionary').asJSON()).toEqual({ + 'a': 1, + 'b': 1, + 'c': 1, + }); + }); + test(`countBy on non dictionary column throws error`, () => { + expect(() => { table.countBy('i32'); }).toThrow(); + }); + }); + describe(`multiple record batches`, () => { + const table = Table.from({ + 'schema': { + 'fields': [ + { + 'name': 'f32', + 'type': { + 'name': 'floatingpoint', + 'precision': 'SINGLE' + }, + 'nullable': false, + 'children': [], + }, + { + 'name': 'i32', + 'type': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 32 + }, + 'nullable': false, + 'children': [], + }, + { + 'name': 'dictionary', + 'type': { + 'name': 'utf8' + }, + 'nullable': false, + 'children': [], + 'dictionary': { + 'id': 0, + 'indexType': { + 'name': 'int', + 'isSigned': true, + 'bitWidth': 8 + }, + 'isOrdered': false + } + } + ] + }, + 'dictionaries': [{ + 'id': 0, + 'data': { + 'count': 3, + 'columns': [ + { + 'name': 'DICT0', + 'count': 3, + 'VALIDITY': [], + 'OFFSET': [ + 0, + 1, + 2, + 3 + ], + 'DATA': [ + 'a', + 'b', + 'c', + ] + } + ] + } + }], + 'batches': [{ + 'count': 3, + 'columns': [ + { + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-0.3, -0.2, -0.1] + }, + { + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-1, 1, -1] + }, + { + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] + } + ] + }, { + 'count': 3, + 'columns': [ + { + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 0.1, 0.2] + }, + { + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [1, -1, 1] + }, + { + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] + } + ] + }, { + 'count': 3, + 'columns': [ + { + 'name': 'f32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0.3, 0.2, 0.1] + }, + { + 'name': 'i32', + 'count': 3, + 'VALIDITY': [], + 'DATA': [-1, 1, -1] + }, + { + 'name': 'dictionary', + 'count': 3, + 'VALIDITY': [], + 'DATA': [0, 1, 2] + } + ] + }] + }); + + // Wrap floating point values in a Float32Array and take them back out to + // make sure that equality checks will pass + const values = [ + [new Float32Array([-0.3])[0], -1, 'a'], + [new Float32Array([-0.2])[0], 1, 'b'], + [new Float32Array([-0.1])[0], -1, 'c'], + [new Float32Array([ 0 ])[0], 1, 'a'], + [new Float32Array([ 0.1])[0], -1, 'b'], + [new Float32Array([ 0.2])[0], 1, 'c'], + [new Float32Array([ 0.3])[0], -1, 'a'], + [new Float32Array([ 0.2])[0], 1, 'b'], + [new Float32Array([ 0.1])[0], -1, 'c'], + ]; + test(`has the correct length`, () => { + expect(table.numRows).toEqual(values.length); + }); + test(`gets expected values`, () => { + for (let i = -1; ++i < values.length;) { + expect(table.get(i).toArray()).toEqual(values[i]); + } + }); + test(`iterates expected values`, () => { + let i = 0; + for (let row of table) { + expect(row.toArray()).toEqual(values[i++]); + } + }); + test(`scans expected values`, () => { + let expected_idx = 0; + table.scan((idx, batch) => { + expect(batch.columns.map((c) => c.get(idx))).toEqual(values[expected_idx++]); + }); + }); + test(`count() returns the correct length`, () => { + expect(table.count()).toEqual(values.length); + }); + test(`filter on f32 >= 0 returns the correct length`, () => { + expect(table.filter(col('f32').gteq(0)).count()).toEqual(6); + }); + test(`filter on i32 <= 0 returns the correct length`, () => { + expect(table.filter(col('i32').lteq(0)).count()).toEqual(5); + }); + test(`filter on dictionary == 'a' returns the correct length`, () => { + expect(table.filter(col('dictionary').eq('a')).count()).toEqual(3); + }); + test(`countBy on dictionary returns the correct counts`, () => { + // Make sure countBy works both with and without the Col wrapper + // class + expect(table.countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 3, + 'b': 3, + 'c': 3, + }); + expect(table.countBy('dictionary').asJSON()).toEqual({ + 'a': 3, + 'b': 3, + 'c': 3, + }); + }); + test(`countBy on dictionary with filter returns the correct counts`, () => { + expect(table.filter(col('i32').eq(1)).countBy(col('dictionary')).asJSON()).toEqual({ + 'a': 1, + 'b': 2, + 'c': 1, + }); + }); + test(`countBy on non dictionary column throws error`, () => { + expect(() => { table.countBy('i32'); }).toThrow(); + }); + }); +}); diff --git a/js/test/unit/vector-tests.ts b/js/test/unit/vector-tests.ts index b2878ddc2ff20..81676b003e399 100644 --- a/js/test/unit/vector-tests.ts +++ b/js/test/unit/vector-tests.ts @@ -93,32 +93,33 @@ describe(`BoolVector`, () => { v.set(7, false); validate(expected1); }); - // test(`packs 0 values`, () => { - // expect(BoolVector.pack([])).toEqual( - // new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])); - // }); - // test(`packs 3 values`, () => { - // expect(BoolVector.pack([ - // true, false, true - // ])).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); - // }); - // test(`packs 8 values`, () => { - // expect(BoolVector.pack([ - // true, true, false, true, true, false, false, false - // ])).toEqual(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); - // }); - // test(`packs 25 values`, () => { - // expect(BoolVector.pack([ - // true, true, false, true, true, false, false, false, - // false, false, false, true, true, false, true, true, - // false - // ])).toEqual(new Uint8Array([27, 216, 0, 0, 0, 0, 0, 0])); - // }); - // test(`from with boolean Array packs values`, () => { - // expect(new BoolVector({ - // data: BoolVector.pack([true, false, true]) - // }).slice()).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); - // }); + test(`packs 0 values`, () => { + expect(BoolVector.from([]).values).toEqual( + new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])); + }); + test(`packs 3 values`, () => { + expect(BoolVector.from([ + true, false, true + ]).values).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); + }); + test(`packs 8 values`, () => { + expect(BoolVector.from([ + true, true, false, true, true, false, false, false + ]).values).toEqual(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); + }); + test(`packs 25 values`, () => { + expect(BoolVector.from([ + true, true, false, true, true, false, false, false, + false, false, false, true, true, false, true, true, + false + ]).values).toEqual(new Uint8Array([27, 216, 0, 0, 0, 0, 0, 0])); + }); + test(`from with boolean Array packs values`, () => { + expect(BoolVector + .from([true, false, true]) + .slice().values + ).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); + }); }); describe('Float16Vector', () => {