diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js index 2976d0ad45d09..c42b3fc323321 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -45,10 +45,11 @@ const createMainPackageJson = (target, format) => (orig) => ({ ...createTypeScriptPackageJson(target, format)(orig), name: npmPkgName, main: mainExport, + types: `${mainExport}.d.ts`, module: `${mainExport}.mjs`, dist: `${mainExport}.es5.min.js`, [`dist:es2015`]: `${mainExport}.es2015.min.js`, - [`@std/esm`]: { esm: `mjs` } + [`@std/esm`]: { esm: `mjs`, warnings: false, sourceMap: true } }); const createTypeScriptPackageJson = (target, format) => (orig) => ({ @@ -63,18 +64,20 @@ const createTypeScriptPackageJson = (target, format) => (orig) => ({ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => conditionallyAddStandardESMEntry(target, format)( - packageJSONFields.reduce( - (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), - { name: `${npmOrgName}/${packageName(target, format)}`, - version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, - dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } - ) + packageJSONFields.reduce( + (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), + { + name: `${npmOrgName}/${packageName(target, format)}`, + version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, + dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined + } + ) ) ); const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( - format !== `esm` - ? packageJSON - : { ...packageJSON, [`@std/esm`]: { esm: `js` } } + format !== `esm` && format !== `cls` + ? packageJSON + : { ...packageJSON, [`@std/esm`]: { esm: `js`, warnings: false, sourceMap: true } } ); \ No newline at end of file diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js index 8b755cf7f1624..c42357adb2f75 100644 --- a/js/gulp/typescript-task.js +++ b/js/gulp/typescript-task.js @@ -34,7 +34,7 @@ const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target const tsProject = ts.createProject(path.join(`tsconfig`, tsconfigFile), { typescript: require(`typescript`) }); const { stream: { js, dts } } = observableFromStreams( tsProject.src(), sourcemaps.init(), - tsProject(ts.reporter.fullReporter(true)) + tsProject(ts.reporter.defaultReporter()) ); const writeDTypes = observableFromStreams(dts, gulp.dest(out)); const writeJS = observableFromStreams(js, sourcemaps.write(), gulp.dest(out)); @@ -52,12 +52,12 @@ function maybeCopyRawJSArrowFormatFiles(target, format) { return Observable.empty(); } return Observable.defer(async () => { - const outFormatDir = path.join(targetDir(target, format), `format`, `fb`); + const outFormatDir = path.join(targetDir(target, format), `fb`); await del(path.join(outFormatDir, '*.js')); await observableFromStreams( - gulp.src(path.join(`src`, `format`, `fb`, `*_generated.js`)), + gulp.src(path.join(`src`, `fb`, `*_generated.js`)), gulpRename((p) => { p.basename = p.basename.replace(`_generated`, ``); }), gulp.dest(outFormatDir) ).toPromise(); }); -} \ No newline at end of file +} diff --git a/js/gulp/uglify-task.js b/js/gulp/uglify-task.js index 5c605cb7882bd..8e6a68a1aae52 100644 --- a/js/gulp/uglify-task.js +++ b/js/gulp/uglify-task.js @@ -29,7 +29,7 @@ const webpack = require(`webpack`); const { memoizeTask } = require('./memoize-task'); const { Observable, ReplaySubject } = require('rxjs'); const UglifyJSPlugin = require(`uglifyjs-webpack-plugin`); -const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js` }); +const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js`, warnings: false }); const uglifyTask = ((cache, commonConfig) => memoizeTask(cache, function uglifyJS(target, format) { diff --git a/js/gulp/util.js b/js/gulp/util.js index ba6ebece51bba..679588d44205c 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -108,14 +108,13 @@ function targetDir(target, format) { function logAndDie(e) { if (e) { - console.error(e); process.exit(1); } } function observableFromStreams(...streams) { - const pumped = streams.length <= 1 ? streams[0] - : pump(...streams, logAndDie); + if (streams.length <= 0) { return Observable.empty(); } + const pumped = streams.length <= 1 ? streams[0] : pump(...streams, logAndDie); const fromEvent = Observable.fromEvent.bind(null, pumped); const streamObs = fromEvent(`data`) .merge(fromEvent(`error`).flatMap((e) => Observable.throw(e))) diff --git a/js/package.json b/js/package.json index d68e7a6279e61..deae744521df6 100644 --- a/js/package.json +++ b/js/package.json @@ -51,15 +51,16 @@ ], "dependencies": { "@types/text-encoding-utf-8": "1.0.1", - "command-line-args": "4.0.7", - "command-line-usage": "4.0.2", + "command-line-args": "5.0.0", + "command-line-usage": "4.1.0", "flatbuffers": "trxcllnt/flatbuffers-esm", "json-bignum": "0.0.3", "text-encoding-utf-8": "^1.0.2", + "ts-node": "4.1.0", "tslib": "1.8.1" }, "devDependencies": { - "@std/esm": "0.19.1", + "@std/esm": "0.19.6", "@types/flatbuffers": "1.6.5", "@types/glob": "5.0.34", "@types/jest": "22.0.1", @@ -79,10 +80,10 @@ "gulp-transform-js-ast": "1.0.2", "gulp-typescript": "3.2.3", "ix": "2.3.4", - "jest": "22.0.5", + "jest": "22.1.2", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", - "lerna": "2.6.0", + "lerna": "2.7.1", "lint-staged": "6.0.0", "merge2": "1.2.1", "mkdirp": "0.5.1", @@ -100,6 +101,9 @@ "webpack": "3.10.0", "xml2js": "0.4.19" }, + "@std/esm": { + "warnings": false + }, "lint-staged": { "*.@(ts)": [ "tslint --fix", @@ -135,7 +139,8 @@ ".(js|jsx)": "./node_modules/babel-jest/build/index.js" }, "transformIgnorePatterns": [ - "/node_modules/", "/(es2015|esnext)\/umd/" + "/node_modules/", + "/(es2015|esnext)/umd/" ], "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" } diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts new file mode 100644 index 0000000000000..8001f524a42fb --- /dev/null +++ b/js/src/Arrow.externs.ts @@ -0,0 +1,378 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * @fileoverview Closure Compiler externs for Arrow + * @externs + * @suppress {duplicate,checkTypes} + */ +/** @type {symbol} */ +Symbol.iterator; +/** @type {symbol} */ +Symbol.asyncIterator; + +let Table = function() {}; +/** @type {?} */ +( Table).from; +/** @type {?} */ +( Table).fromAsync; +/** @type {?} */ +( Table).empty; +/** @type {?} */ +Table.prototype.columns; +/** @type {?} */ +Table.prototype.length; +/** @type {?} */ +Table.prototype.select; +/** @type {?} */ +Table.prototype.toString; + +let Vector = function() {}; +/** @type {?} */ +( Vector).create; +/** @type {?} */ +Vector.prototype.data; +/** @type {?} */ +Vector.prototype.type; +/** @type {?} */ +Vector.prototype.length; +/** @type {?} */ +Vector.prototype.nullCount; +/** @type {?} */ +Vector.prototype.nullBitmap; +/** @type {?} */ +Vector.prototype.isValid; +/** @type {?} */ +Vector.prototype.get; +/** @type {?} */ +Vector.prototype.set; +/** @type {?} */ +Vector.prototype.setData; +/** @type {?} */ +Vector.prototype.toArray; +/** @type {?} */ +Vector.prototype.concat; +/** @type {?} */ +Vector.prototype.slice; +/** @type {?} */ +Vector.prototype.acceptTypeVisitor; + +let DataType = function() {}; +/** @type {?} */ +( DataType).isNull; +/** @type {?} */ +( DataType).isInt; +/** @type {?} */ +( DataType).isFloat; +/** @type {?} */ +( DataType).isBinary; +/** @type {?} */ +( DataType).isUtf8; +/** @type {?} */ +( DataType).isBool; +/** @type {?} */ +( DataType).isDecimal; +/** @type {?} */ +( DataType).isDate; +/** @type {?} */ +( DataType).isTime; +/** @type {?} */ +( DataType).isTimestamp; +/** @type {?} */ +( DataType).isInterval; +/** @type {?} */ +( DataType).isList; +/** @type {?} */ +( DataType).isStruct; +/** @type {?} */ +( DataType).isUnion; +/** @type {?} */ +( DataType).isDenseUnion; +/** @type {?} */ +( DataType).isSparseUnion; +/** @type {?} */ +( DataType).isFixedSizeBinary; +/** @type {?} */ +( DataType).isFixedSizeList; +/** @type {?} */ +( DataType).isMap; +/** @type {?} */ +( DataType).isDictionary; + +let BaseData = function() {}; +/** @type {?} */ +BaseData.prototype.type; +/** @type {?} */ +BaseData.prototype.clone; +/** @type {?} */ +BaseData.prototype.slice; +/** @type {?} */ +BaseData.prototype.length; +/** @type {?} */ +BaseData.prototype.offset; +/** @type {?} */ +BaseData.prototype.typeId; +/** @type {?} */ +BaseData.prototype.childData; +/** @type {?} */ +BaseData.prototype.nullBitmap; +/** @type {?} */ +BaseData.prototype.nullCount; + +let FlatData = function() {}; +/** @type {?} */ +FlatData.prototype.values; + +let FlatListData = function() {}; +/** @type {?} */ +FlatListData.prototype.values; +/** @type {?} */ +FlatListData.prototype.valueOffsets; + +let DictionaryData = function() {}; +/** @type {?} */ +DictionaryData.prototype.indicies; +/** @type {?} */ +DictionaryData.prototype.dictionary; + +let ListData = function() {}; +/** @type {?} */ +ListData.prototype.values; +/** @type {?} */ +ListData.prototype.valueOffsets; + +let UnionData = function() {}; +/** @type {?} */ +UnionData.prototype.typeIds; + +let DenseUnionData = function() {}; +/** @type {?} */ +DenseUnionData.prototype.valueOffsets; + +let ChunkedData = function() {}; +/** @type {?} */ +( ChunkedData).computeOffsets; + +let FlatVector = function() {}; +/** @type {?} */ +FlatVector.prototype.values; + +let ListVectorBase = function() {}; +/** @type {?} */ +ListVectorBase.prototype.values; +/** @type {?} */ +ListVectorBase.prototype.valueOffsets; +/** @type {?} */ +ListVectorBase.prototype.getValueOffset; +/** @type {?} */ +ListVectorBase.prototype.getValueLength; + +let NestedVector = function() {}; +/** @type {?} */ +NestedVector.prototype.childData; +/** @type {?} */ +NestedVector.prototype.getChildAt; + +let DictionaryVector = function() {}; +/** @type {?} */ +DictionaryVector.prototype.getKey; +/** @type {?} */ +DictionaryVector.prototype.getValue; + +let FlatView = function() {}; +/** @type {?} */ +FlatView.prototype.get; +/** @type {?} */ +FlatView.prototype.isValid; +/** @type {?} */ +FlatView.prototype.toArray; +/** @type {?} */ +FlatView.prototype.set; +/** @type {?} */ +FlatView.prototype.setData; + +let NullView = function() {}; +/** @type {?} */ +NullView.prototype.get; +/** @type {?} */ +NullView.prototype.isValid; +/** @type {?} */ +NullView.prototype.toArray; +/** @type {?} */ +NullView.prototype.set; +/** @type {?} */ +NullView.prototype.setData; + +let BoolView = function() {}; +/** @type {?} */ +BoolView.prototype.get; +/** @type {?} */ +BoolView.prototype.isValid; +/** @type {?} */ +BoolView.prototype.toArray; +/** @type {?} */ +BoolView.prototype.set; +/** @type {?} */ +BoolView.prototype.setData; + +let ValidityView = function() {}; +/** @type {?} */ +ValidityView.prototype.get; +/** @type {?} */ +ValidityView.prototype.isValid; +/** @type {?} */ +ValidityView.prototype.toArray; +/** @type {?} */ +ValidityView.prototype.set; +/** @type {?} */ +ValidityView.prototype.setData; + +let DictionaryView = function() {}; +/** @type {?} */ +DictionaryView.prototype.get; +/** @type {?} */ +DictionaryView.prototype.isValid; +/** @type {?} */ +DictionaryView.prototype.toArray; +/** @type {?} */ +DictionaryView.prototype.set; +/** @type {?} */ +DictionaryView.prototype.setData; + +let ListViewBase = function() {}; +/** @type {?} */ +ListViewBase.prototype.get; +/** @type {?} */ +ListViewBase.prototype.isValid; +/** @type {?} */ +ListViewBase.prototype.toArray; +/** @type {?} */ +ListViewBase.prototype.set; +/** @type {?} */ +ListViewBase.prototype.setData; + +let NestedView = function() {}; +/** @type {?} */ +NestedView.prototype.get; +/** @type {?} */ +NestedView.prototype.isValid; +/** @type {?} */ +NestedView.prototype.toArray; +/** @type {?} */ +NestedView.prototype.set; +/** @type {?} */ +NestedView.prototype.setData; + +let ChunkedView = function() {}; +/** @type {?} */ +ChunkedView.prototype.get; +/** @type {?} */ +ChunkedView.prototype.isValid; +/** @type {?} */ +ChunkedView.prototype.toArray; +/** @type {?} */ +ChunkedView.prototype.set; +/** @type {?} */ +ChunkedView.prototype.setData; + +let TypeVisitor = function() {}; +/** @type {?} */ +( TypeVisitor).visitTypeInline; +/** @type {?} */ +TypeVisitor.prototype.visit; +/** @type {?} */ +TypeVisitor.prototype.visitMany; +/** @type {?} */ +TypeVisitor.prototype.visitNull; +/** @type {?} */ +TypeVisitor.prototype.visitBool; +/** @type {?} */ +TypeVisitor.prototype.visitInt; +/** @type {?} */ +TypeVisitor.prototype.visitFloat; +/** @type {?} */ +TypeVisitor.prototype.visitUtf8; +/** @type {?} */ +TypeVisitor.prototype.visitBinary; +/** @type {?} */ +TypeVisitor.prototype.visitFixedSizeBinary; +/** @type {?} */ +TypeVisitor.prototype.visitDate; +/** @type {?} */ +TypeVisitor.prototype.visitTimestamp; +/** @type {?} */ +TypeVisitor.prototype.visitTime; +/** @type {?} */ +TypeVisitor.prototype.visitDecimal; +/** @type {?} */ +TypeVisitor.prototype.visitList; +/** @type {?} */ +TypeVisitor.prototype.visitStruct; +/** @type {?} */ +TypeVisitor.prototype.visitUnion; +/** @type {?} */ +TypeVisitor.prototype.visitDictionary; +/** @type {?} */ +TypeVisitor.prototype.visitInterval; +/** @type {?} */ +TypeVisitor.prototype.visitFixedSizeList; +/** @type {?} */ +TypeVisitor.prototype.visitMap; + +let VectorVisitor = function() {}; +/** @type {?} */ +( VectorVisitor).visitTypeInline; +/** @type {?} */ +VectorVisitor.prototype.visit; +/** @type {?} */ +VectorVisitor.prototype.visitMany; +/** @type {?} */ +VectorVisitor.prototype.visitNullVector; +/** @type {?} */ +VectorVisitor.prototype.visitBoolVector; +/** @type {?} */ +VectorVisitor.prototype.visitIntVector; +/** @type {?} */ +VectorVisitor.prototype.visitFloatVector; +/** @type {?} */ +VectorVisitor.prototype.visitUtf8Vector; +/** @type {?} */ +VectorVisitor.prototype.visitBinaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitFixedSizeBinaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitDateVector; +/** @type {?} */ +VectorVisitor.prototype.visitTimestampVector; +/** @type {?} */ +VectorVisitor.prototype.visitTimeVector; +/** @type {?} */ +VectorVisitor.prototype.visitDecimalVector; +/** @type {?} */ +VectorVisitor.prototype.visitListVector; +/** @type {?} */ +VectorVisitor.prototype.visitStructVector; +/** @type {?} */ +VectorVisitor.prototype.visitUnionVector; +/** @type {?} */ +VectorVisitor.prototype.visitDictionaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitIntervalVector; +/** @type {?} */ +VectorVisitor.prototype.visitFixedSizeListVector; +/** @type {?} */ +VectorVisitor.prototype.visitMapVector; \ No newline at end of file diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts new file mode 100644 index 0000000000000..c988e0cf8893f --- /dev/null +++ b/js/src/Arrow.ts @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import * as type_ from './type'; +import * as data from './data'; +import * as vector from './vector'; + +import { Table, RecordBatch } from './table'; +import { Uint64, Int64, Int128 } from './util/int'; +import { View, Vector, VectorLike, } from './vector'; +import { read, readAsync } from './ipc/reader/arrow'; +import { Schema, Field, DataType, Type } from './type'; + +// closure compiler always erases static method names: +// https://github.com/google/closure-compiler/issues/1776 +// set them via string indexers to save them from the mangler +Table['from'] = Table.from; +Table['fromAsync'] = Table.fromAsync; +Table['empty'] = Table.empty; +Vector['create'] = Vector.create; + +export import IntBitWidth = type_.IntBitWidth; +export import TimeBitWidth = type_.TimeBitWidth; + +export import TypedArray = type_.TypedArray; +export import TypedArrayConstructor = type_.TypedArrayConstructor; + +export { read, readAsync }; +export { View, VectorLike }; +export { Uint64, Int64, Int128 }; +export { type_ as type, data, vector }; +export { Table, Field, Schema, Vector, DataType, RecordBatch, Type }; + +/* These exports are needed for the closure and uglify umd targets */ +try { + let Arrow: any = eval('exports'); + if (Arrow && typeof Arrow === 'object') { + // string indexers tell closure and uglify not to rename these properties + Arrow['data'] = data; + Arrow['type'] = type_; + Arrow['read'] = read; + Arrow['readAsync'] = readAsync; + Arrow['Int64'] = Int64; + Arrow['Uint64'] = Uint64; + Arrow['Int128'] = Int128; + + Arrow['Table'] = Table; + Arrow['Field'] = Field; + Arrow['Schema'] = Schema; + Arrow['Vector'] = Vector; + Arrow['DataType'] = DataType; + Arrow['RecordBatch'] = RecordBatch; + + Arrow['data']['BaseData'] = data.BaseData; + Arrow['data']['FlatData'] = data.FlatData; + Arrow['data']['BoolData'] = data.BoolData; + Arrow['data']['FlatListData'] = data.FlatListData; + Arrow['data']['DictionaryData'] = data.DictionaryData; + Arrow['data']['NestedData'] = data.NestedData; + Arrow['data']['ListData'] = data.ListData; + Arrow['data']['UnionData'] = data.UnionData; + Arrow['data']['SparseUnionData'] = data.SparseUnionData; + Arrow['data']['DenseUnionData'] = data.DenseUnionData; + + Arrow['type']['DataType'] = type_.DataType; + Arrow['type']['Null'] = type_.Null; + Arrow['type']['Bool'] = type_.Bool; + Arrow['type']['Decimal'] = type_.Decimal; + Arrow['type']['Int'] = type_.Int; + Arrow['type']['Int8'] = type_.Int8; + Arrow['type']['Int16'] = type_.Int16; + Arrow['type']['Int32'] = type_.Int32; + Arrow['type']['Int64'] = type_.Int64; + Arrow['type']['Uint8'] = type_.Uint8; + Arrow['type']['Uint16'] = type_.Uint16; + Arrow['type']['Uint32'] = type_.Uint32; + Arrow['type']['Uint64'] = type_.Uint64; + Arrow['type']['Float'] = type_.Float; + Arrow['type']['Float16'] = type_.Float16; + Arrow['type']['Float32'] = type_.Float32; + Arrow['type']['Float64'] = type_.Float64; + Arrow['type']['Date_'] = type_.Date_; + Arrow['type']['Time'] = type_.Time; + Arrow['type']['Timestamp'] = type_.Timestamp; + Arrow['type']['Interval'] = type_.Interval; + Arrow['type']['Binary'] = type_.Binary; + Arrow['type']['FixedSizeBinary'] = type_.FixedSizeBinary; + Arrow['type']['Utf8'] = type_.Utf8; + Arrow['type']['List'] = type_.List; + Arrow['type']['FixedSizeList'] = type_.FixedSizeList; + Arrow['type']['Struct'] = type_.Struct; + Arrow['type']['Union'] = type_.Union; + Arrow['type']['Map_'] = type_.Map_; + Arrow['type']['Dictionary'] = type_.Dictionary; + + Arrow['vector']['Vector'] = vector.Vector; + Arrow['vector']['FlatVector'] = vector.FlatVector; + Arrow['vector']['ListVectorBase'] = vector.ListVectorBase; + Arrow['vector']['NestedVector'] = vector.NestedVector; + Arrow['vector']['NullVector'] = vector.NullVector; + Arrow['vector']['BoolVector'] = vector.BoolVector; + Arrow['vector']['IntVector'] = vector.IntVector; + Arrow['vector']['FloatVector'] = vector.FloatVector; + Arrow['vector']['DateVector'] = vector.DateVector; + Arrow['vector']['DecimalVector'] = vector.DecimalVector; + Arrow['vector']['TimeVector'] = vector.TimeVector; + Arrow['vector']['TimestampVector'] = vector.TimestampVector; + Arrow['vector']['IntervalVector'] = vector.IntervalVector; + Arrow['vector']['BinaryVector'] = vector.BinaryVector; + Arrow['vector']['FixedSizeBinaryVector'] = vector.FixedSizeBinaryVector; + Arrow['vector']['Utf8Vector'] = vector.Utf8Vector; + Arrow['vector']['ListVector'] = vector.ListVector; + Arrow['vector']['FixedSizeListVector'] = vector.FixedSizeListVector; + Arrow['vector']['MapVector'] = vector.MapVector; + Arrow['vector']['StructVector'] = vector.StructVector; + Arrow['vector']['UnionVector'] = vector.UnionVector; + Arrow['vector']['DictionaryVector'] = vector.DictionaryVector; + } +} catch (e) { /* not the UMD bundle */ } +/* end umd exports */ diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 01ef0b848ce75..86536b5a6ffee 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -1,4 +1,4 @@ -// #! /usr/bin/env node +#! /usr/bin/env node // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file @@ -19,11 +19,9 @@ /* tslint:disable */ +import * as fs from 'fs'; import * as Arrow from '../Arrow'; -(function() { - -const fs = require('fs'); const { parse } = require('json-bignum'); const optionList = [ { @@ -36,12 +34,13 @@ const optionList = [ { type: String, name: 'file', alias: 'f', + optional: false, multiple: true, description: 'The Arrow file to read' } ]; const argv = require(`command-line-args`)(optionList, { partial: true }); -const files = [argv.file, ...(argv._unknown || [])].filter(Boolean); +const files = [...argv.file, ...(argv._unknown || [])].filter(Boolean); if (!files.length) { console.log(require('command-line-usage')([ @@ -85,51 +84,15 @@ if (!files.length) { } files.forEach((source) => { - let table: any, input = fs.readFileSync(source); + // debugger; + let table: Arrow.Table, input = fs.readFileSync(source); try { - table = Arrow.Table.from([input]); + table = Arrow.Table.from(input); } catch (e) { table = Arrow.Table.from(parse(input + '')); } if (argv.schema && argv.schema.length) { table = table.select(...argv.schema); } - printTable(table); + table.rowsToString().pipe(process.stdout); }); - -function printTable(table: Arrow.Table) { - let header = [...table.columns.map((_, i) => table.key(i))].map(stringify); - let maxColumnWidths = header.map(x => x.length); - // Pass one to convert to strings and count max column widths - for (let i = -1, n = table.length - 1; ++i < n;) { - let val, - row = [i, ...table.get(i)]; - for (let j = -1, k = row.length; ++j < k; ) { - val = stringify(row[j]); - maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); - } - } - console.log(header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(' | ')); - // Pass two to pad each one to max column width - for (let i = -1, n = table.length; ++i < n; ) { - console.log( - [...table.get(i)] - .map(stringify) - .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) - .join(' | ') - ); - } -} - -function leftPad(str: string, fill: string, n: number) { - return (new Array(n + 1).join(fill) + str).slice(-1 * n); -} - -function stringify(x: any) { - return typeof x === 'string' ? `"${x}"` - : Array.isArray(x) ? JSON.stringify(x) - : ArrayBuffer.isView(x) ? `[${x}]` - : `${x}`; -} - -})(); \ No newline at end of file diff --git a/js/src/data.ts b/js/src/data.ts index 5e7fd35431b26..5a6e26da9c2fc 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -15,9 +15,17 @@ // specific language governing permissions and limitations // under the License. -import { VectorLike } from './vector'; -import { VectorType, TypedArray, TypedArrayConstructor } from './type'; -import { DataType, FlatType, ListType, NestedType, Map_, DenseUnion, SparseUnion } from './type'; +import { popcnt_bit_range } from './util/bit'; +import { VectorLike, Vector } from './vector'; +import { VectorType, TypedArray, TypedArrayConstructor, Dictionary } from './type'; +import { Int, Bool, FlatListType, List, FixedSizeList, Struct, Map_ } from './type'; +import { DataType, FlatType, ListType, NestedType, DenseUnion, SparseUnion } from './type'; + +export function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { + return values instanceof ArrayType ? values + : !values || !ArrayBuffer.isView(values) ? ArrayType.from(values || []) + : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); +} export type Data = DataTypes[T['TType']] & BaseData; export interface DataTypes { @@ -25,22 +33,23 @@ export interface DataTypes { /* [Type.Null]*/ 1: FlatData; /* [Type.Int]*/ 2: FlatData; /* [Type.Float]*/ 3: FlatData; -/* [Type.Binary]*/ 4: ListData; -/* [Type.Utf8]*/ 5: ListData; -/* [Type.Bool]*/ 6: FlatData; +/* [Type.Binary]*/ 4: FlatListData; +/* [Type.Utf8]*/ 5: FlatListData; +/* [Type.Bool]*/ 6: BoolData; /* [Type.Decimal]*/ 7: FlatData; /* [Type.Date]*/ 8: FlatData; /* [Type.Time]*/ 9: FlatData; /* [Type.Timestamp]*/ 10: FlatData; /* [Type.Interval]*/ 11: FlatData; -/* [Type.List]*/ 12: ListData; -/* [Type.Struct]*/ 13: FlatData; +/* [Type.List]*/ 12: ListData>; +/* [Type.Struct]*/ 13: NestedData; /* [Type.Union]*/ 14: UnionData; /* [Type.FixedSizeBinary]*/ 15: FlatData; -/* [Type.FixedSizeList]*/ 16: ListData; +/* [Type.FixedSizeList]*/ 16: ListData>; /* [Type.Map]*/ 17: NestedData; /* [Type.DenseUnion]*/ DenseUnion: DenseUnionData; /*[Type.SparseUnion]*/ SparseUnion: SparseUnionData; +/*[ Type.Dictionary]*/ Dictionary: DictionaryData; } // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count @@ -52,38 +61,46 @@ export const kUnknownNullCount = -1; export class BaseData implements VectorLike { protected _type: T; protected _length: number; + protected _offset: number; // @ts-ignore - protected _childData: BaseData[]; + protected _childData: Data[]; protected _nullCount: number | kUnknownNullCount; protected /* [VectorType.OFFSET]:*/ 0?: Int32Array; protected /* [VectorType.DATA]:*/ 1?: T['TArray']; protected /*[VectorType.VALIDITY]:*/ 2?: Uint8Array; protected /* [VectorType.TYPE]:*/ 3?: Int8Array; - constructor(type: T, length: number, nullCount?: number) { + constructor(type: T, length: number, offset?: number, nullCount?: number) { this._type = type; - this._length = Math.max(length || 0, 0); - this._nullCount = Math.max(nullCount || 0, -1); + this._length = Math.floor(Math.max(length || 0, 0)); + this._offset = Math.floor(Math.max(offset || 0, 0)); + this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); } public get type() { return this._type; } public get length() { return this._length; } + public get offset() { return this._offset; } public get typeId() { return this._type.TType; } public get childData() { return this._childData; } - public get nullCount() { return this._nullCount; } public get nullBitmap() { return this[VectorType.VALIDITY]; } - public clone(length = this._length, nullCount = this._nullCount) { - return new BaseData(this._type, length, nullCount) as this; + public get nullCount() { + let nullCount = this._nullCount; + let nullBitmap: Uint8Array | undefined; + if (nullCount === -1 && (nullBitmap = this[VectorType.VALIDITY])) { + this._nullCount = nullCount = popcnt_bit_range(nullBitmap, this._offset, this._offset + this._length); + } + return nullCount; + } + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new BaseData(this._type, length, offset, nullCount) as this; } public slice(offset: number, length: number) { return length <= 0 ? this : this.sliceInternal(this.clone( - length, +(this._nullCount === 0) - 1 + length, this._offset + offset, +(this._nullCount === 0) - 1 ), offset, length); } protected sliceInternal(clone: this, offset: number, length: number) { let arr: any; // If typeIds exist, slice the typeIds buffer (arr = this[VectorType.TYPE]) && (clone[VectorType.TYPE] = this.sliceData(arr, offset, length)); - // If a null bitmap exists, slice the null bitmap - (arr = this[VectorType.VALIDITY]) && (clone[VectorType.VALIDITY] = this.sliceNullBitmap(arr, offset, length)); // If offsets exist, only slice the offsets buffer (arr = this[VectorType.OFFSET]) && (clone[VectorType.OFFSET] = this.sliceOffsets(arr, offset, length)) || // Otherwise if no offsets, slice the data buffer @@ -96,65 +113,76 @@ export class BaseData implements VectorLike { protected sliceOffsets(valueOffsets: Int32Array, offset: number, length: number) { return valueOffsets.subarray(offset, offset + length + 1); } - protected sliceNullBitmap(nullBitmap: Uint8Array, offset: number, length: number) { - return length >= 8 - ? nullBitmap.subarray(offset >> 3, ((offset + length) >> 3)) - : nullBitmap.subarray(offset >> 3, ((offset + length) >> 3) + 1); - } } export class FlatData extends BaseData { public /* [VectorType.DATA]:*/ 1: T['TArray']; public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; public get values() { return this[VectorType.DATA]; } - constructor(type: T, length: number, nullBitmap: Uint8Array, data: Iterable, nullCount?: number) { - super(type, length, nullCount); + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, data: Iterable, offset?: number, nullCount?: number) { + super(type, length, offset, nullCount); this[VectorType.DATA] = toTypedArray(this.ArrayType, data); this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); } public get ArrayType(): T['ArrayType'] { return this._type.ArrayType; } - public clone(length = this._length, nullCount = this._nullCount) { - return new FlatData( - this._type, length, this[VectorType.VALIDITY], - this[VectorType.DATA], nullCount - ) as this; + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new FlatData(this._type, length, this[VectorType.VALIDITY], this[VectorType.DATA], offset, nullCount) as this; + } +} + +export class BoolData extends FlatData { + protected sliceData(data: Uint8Array) { return data; } + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new BoolData(this._type, length, this[VectorType.VALIDITY], this[VectorType.DATA], offset, nullCount) as this; } } -export class ListData extends BaseData { +export class FlatListData extends FlatData { public /* [VectorType.OFFSET]:*/ 0: Int32Array; public /* [VectorType.DATA]:*/ 1: T['TArray']; public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; public get values() { return this[VectorType.DATA]; } public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: T, length: number, nullBitmap: Uint8Array, data: T['TArray'], valueOffsets: Iterable, nullCount?: number) { - super(type, length, nullCount); - this[VectorType.DATA] = data; + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, data: T['TArray'], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, data, offset, nullCount); this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); - this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); } - public clone(length = this._length, nullCount = this._nullCount) { - return new ListData( - this._type, length, this[VectorType.VALIDITY], - this[VectorType.DATA], this[VectorType.OFFSET], - nullCount - ) as this; + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new FlatListData(this._type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this[VectorType.DATA], offset, nullCount) as this; + } +} + +export class DictionaryData extends BaseData> { + protected _dictionary: Vector; + protected _indicies: Data>; + public get indicies() { return this._indicies; } + public get dictionary() { return this._dictionary; } + constructor(type: Dictionary, dictionary: Vector, indicies: Data>) { + super(type, indicies.length, (indicies as any)._nullCount); + this._indicies = indicies; + this._dictionary = dictionary; + } + public get length() { return this._indicies.length; } + public get nullCount() { return this._indicies.nullCount; } + public clone(length = this._length, offset = this._offset) { + return new DictionaryData(this._type, this._dictionary, this._indicies.slice(offset - this._offset, length)) as this; + } + protected sliceInternal(clone: this, _offset: number, _length: number) { + clone._length = clone._indicies.length; + clone._nullCount = (clone._indicies as any)._nullCount; + return clone; } } export class NestedData extends BaseData { public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - constructor(type: T, length: number, nullBitmap: Uint8Array, childData: BaseData[], nullCount?: number) { - super(type, length, nullCount); + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, offset, nullCount); this._childData = childData; - this[VectorType.VALIDITY] = nullBitmap; + this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); } - public clone(length = this._length, nullCount = this._nullCount) { - return new NestedData( - this._type, length, - this[VectorType.VALIDITY], - this._childData, nullCount - ) as this; + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new NestedData(this._type, length, this[VectorType.VALIDITY], this._childData, offset, nullCount) as this; } protected sliceInternal(clone: this, offset: number, length: number) { if (!this[VectorType.OFFSET]) { @@ -164,51 +192,112 @@ export class NestedData extends BaseData { } } +export class ListData extends NestedData { + public /* [VectorType.OFFSET]:*/ 0: Int32Array; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + protected _valuesData: Data; + public get values() { return this._valuesData; } + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, valueChildData: Data, offset?: number, nullCount?: number) { + super(type, length, nullBitmap, [valueChildData], offset, nullCount); + this._valuesData = valueChildData; + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new ListData(this._type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this._valuesData, offset, nullCount) as this; + } +} + export class UnionData extends NestedData { public /* [VectorType.TYPE]:*/ 3: T['TArray']; public get typeIds() { return this[VectorType.TYPE]; } - constructor(type: T, length: number, nullBitmap: Uint8Array, typeIds: Iterable, childData: BaseData[], nullCount?: number) { - super(type, length, nullBitmap, childData, nullCount); + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, childData, offset, nullCount); this[VectorType.TYPE] = toTypedArray(Int8Array, typeIds); } - public clone(length = this._length, nullCount = this._nullCount) { - return new UnionData( - this._type, length, this[VectorType.VALIDITY], - this[VectorType.TYPE], this._childData, nullCount - ) as this; + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new UnionData(this._type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this._childData, offset, nullCount) as this; + } +} + +export class SparseUnionData extends UnionData { + constructor(type: SparseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, typeIds, childData, offset, nullCount); + } + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new SparseUnionData(this._type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this._childData, offset, nullCount) as this; } } export class DenseUnionData extends UnionData { public /* [VectorType.OFFSET]:*/ 0: Int32Array; public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: DenseUnion, length: number, nullBitmap: Uint8Array, typeIds: Iterable, valueOffsets: Iterable, childData: BaseData[], nullCount?: number) { - super(type, length, nullBitmap, typeIds, childData, nullCount); + constructor(type: DenseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, valueOffsets: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, typeIds, childData, offset, nullCount); this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); } - public clone(length = this._length, nullCount = this._nullCount) { - return new DenseUnionData( - this._type, length, this[VectorType.VALIDITY], - this[VectorType.TYPE], this[VectorType.OFFSET], - this._childData, nullCount - ) as this; + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new DenseUnionData(this._type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this[VectorType.OFFSET], this._childData, offset, nullCount) as this; } } -export class SparseUnionData extends UnionData { - constructor(type: SparseUnion, length: number, nullBitmap: Uint8Array, typeIds: Iterable, childData: BaseData[], nullCount?: number) { - super(type, length, nullBitmap, typeIds, childData, nullCount); +export class ChunkedData extends BaseData { + protected _childVectors: Vector[]; + protected _childOffsets: Uint32Array; + public get childVectors() { return this._childVectors; } + public get childOffsets() { return this._childOffsets; } + public get childData() { + return this._childData || ( + this._childData = this._childVectors.map(({ data }) => data)); + } + constructor(type: T, length: number, childVectors: Vector[], offset?: number, nullCount?: number, childOffsets?: Uint32Array) { + super(type, length, offset, nullCount); + this._childVectors = childVectors; + this._childOffsets = childOffsets || ChunkedData.computeOffsets(childVectors); } - public clone(length = this._length, nullCount = this._nullCount) { - return new SparseUnionData( - this._type, length, this[VectorType.VALIDITY], - this[VectorType.TYPE], this._childData, nullCount - ) as this; + public get nullCount() { + let nullCount = this._nullCount; + if (nullCount === -1) { + this._nullCount = nullCount = this._childVectors.reduce((x, c) => x + c.nullCount, 0); + } + return nullCount; + } + public clone(length = this._length, offset = this._offset, nullCount = this._nullCount) { + return new ChunkedData(this._type, length, this._childVectors, offset, nullCount, this._childOffsets) as this; + } + protected sliceInternal(clone: this, offset: number, length: number) { + const chunks = this._childVectors; + const offsets = this._childOffsets; + const chunkSlices: Vector[] = []; + for (let childIndex = -1, numChildren = chunks.length; ++childIndex < numChildren;) { + const child = chunks[childIndex]; + const childLength = child.length; + const childOffset = offsets[childIndex]; + // If the child is to the right of the slice boundary, exclude + if (childOffset >= offset + length) { continue; } + // If the child is to the left of of the slice boundary, exclude + if (offset >= childOffset + childLength) { continue; } + // If the child is between both left and right boundaries, include w/o slicing + if (childOffset >= offset && (childOffset + childLength) <= offset + length) { + chunkSlices.push(child); + continue; + } + // If the child overlaps one of the slice boundaries, include that slice + const begin = Math.max(0, offset - childOffset); + const end = begin + Math.min(childLength - begin, (offset + length) - childOffset); + chunkSlices.push(child.slice(begin, end)); + } + clone._childVectors = chunkSlices; + clone._childOffsets = ChunkedData.computeOffsets(chunkSlices); + return clone; + } + static computeOffsets(childVectors: Vector[]) { + const childOffsets = new Uint32Array(childVectors.length + 1); + for (let index = 0, length = childOffsets.length, childOffset = childOffsets[0] = 0; ++index < length;) { + childOffsets[index] = (childOffset += childVectors[index - 1].length); + } + return childOffsets; } } -function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { - return values instanceof ArrayType ? values - : !values || !ArrayBuffer.isView(values) ? ArrayType.from(values || []) - : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); -} +ChunkedData['computeOffsets'] = ChunkedData.computeOffsets; \ No newline at end of file diff --git a/js/src/fb/File.ts b/js/src/fb/File.ts index 56f50ed20e936..f4ba865ff040b 100644 --- a/js/src/fb/File.ts +++ b/js/src/fb/File.ts @@ -14,6 +14,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -175,6 +176,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/fb/Message.ts b/js/src/fb/Message.ts index 4610fbef2e1c8..537c65d1f8c93 100644 --- a/js/src/fb/Message.ts +++ b/js/src/fb/Message.ts @@ -45,6 +45,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -110,6 +111,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -265,6 +267,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -369,6 +372,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/fb/Schema.ts b/js/src/fb/Schema.ts index d9b45ed20089c..4a4aeb65599be 100644 --- a/js/src/fb/Schema.ts +++ b/js/src/fb/Schema.ts @@ -165,6 +165,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -221,6 +222,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -273,6 +275,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -325,6 +328,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -420,6 +424,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -495,6 +500,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -617,6 +623,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -701,6 +708,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -771,6 +779,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -823,6 +832,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -875,6 +885,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -945,6 +956,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -997,6 +1009,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1092,6 +1105,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1164,6 +1178,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1255,6 +1270,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1363,6 +1379,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1435,6 +1452,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1527,6 +1545,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1620,6 +1639,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1741,6 +1761,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -2026,6 +2047,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -2089,6 +2111,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/ipc/message.ts b/js/src/ipc/message.ts deleted file mode 100644 index 57712f84a6d6c..0000000000000 --- a/js/src/ipc/message.ts +++ /dev/null @@ -1,156 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { align } from '../util/bit'; -import * as Schema_ from '../fb/Schema'; -import * as Message_ from '../fb/Message'; -import { flatbuffers } from 'flatbuffers'; -import { DataType, Int, Dictionary } from '../type'; -import { MessageVisitor, VisitorNode } from '../visitor'; - -export import Long = flatbuffers.Long; -export import Endianness = Schema_.org.apache.arrow.flatbuf.Endianness; -export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; - -export class Footer implements Partial { - constructor(public dictionaryBatches: Block[], public recordBatches: Block[], public schema: Schema) {} - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitFooter(this); - } -} - -export class Block implements Partial { - public readonly offset: number; - public readonly offsetLong: Long; - public readonly bodyLength: number; - public readonly bodyLengthLong: Long; - constructor(offset: Long | number, public metaDataLength: number, bodyLength: Long | number) { - this.offset = (this.offsetLong = typeof offset === 'number' ? new Long(offset, 0) : offset).low; - this.bodyLength = (this.bodyLengthLong = typeof bodyLength === 'number' ? new Long(bodyLength, 0) : bodyLength).low; - } - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitBlock(this); - } -} - -export class Message implements Partial { - public readonly bodyLength: number; - public readonly bodyLengthLong: Long; - constructor(public version: MetadataVersion, public headerType: MessageHeader, bodyLength: Long | number) { - this.bodyLength = (this.bodyLengthLong = typeof bodyLength === 'number' ? new Long(bodyLength, 0) : bodyLength).low; - } - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitMessage(this); - } - static isSchema(x: Message): x is Schema { return x.headerType === MessageHeader.Schema; } - static isRecordBatch(x: Message): x is RecordBatch { return x.headerType === MessageHeader.RecordBatch; } - static isDictionaryBatch(x: Message): x is DictionaryBatch { return x.headerType === MessageHeader.DictionaryBatch; } -} - -export class Schema extends Message { - public dictionaries: Map; - constructor(version: MetadataVersion, public fields: Field[], public customMetadata?: Map, public endianness = Endianness.Little) { - super(version, MessageHeader.Schema, Long.ZERO); - this.dictionaries = fields.reduce(function flattenDictionaryFields(dictionaries, f): Map { - if (f.dictionary) { - const id = f.dictionary.id.toString(); - if (dictionaries.has(id)) { - dictionaries.set(id, f); - } - } - return (f.type.children || []).reduce(flattenDictionaryFields, dictionaries); - }, new Map()); - } -} - -export class RecordBatch extends Message { - public readonly length: number; - public readonly lengthLong: Long; - constructor(version: MetadataVersion, length: Long | number, public fieldNodes: FieldNode[], public buffers: Buffer[]) { - super(version, MessageHeader.RecordBatch, buffers.reduce((s, b) => align(s + b.length + (b.offset - s), 8), 0)); - this.length = (this.lengthLong = typeof length === 'number' ? new Long(length, 0) : length).low; - } - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitRecordBatch(this); - } -} - -export class DictionaryBatch extends Message { - public readonly dictionaryId: number; - public readonly dictionaryIdLong: Long; - constructor(version: MetadataVersion, public dictionary: RecordBatch, dictionaryId: Long | number, public isDelta: boolean) { - super(version, MessageHeader.DictionaryBatch, dictionary.bodyLength); - this.dictionaryId = (this.dictionaryIdLong = typeof dictionaryId === 'number' ? new Long(dictionaryId, 0) : dictionaryId).low; - } - public get fieldNodes(): FieldNode[] { return this.dictionary.fieldNodes; } - public get buffers(): Buffer[] { return this.dictionary.buffers; } - public acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitDictionaryBatch(this); - } - static atomicDictionaryId = 0; -} - -export class Field implements Partial { - constructor(public name: string, - public type: T, - public nullable = false, - public metadata?: Map | null, - public dictionary?: Dictionary | null) { - } - get typeId(): T['TType'] { return this.type.TType; } - get [Symbol.toStringTag](): string { return 'Field'; } - get keys(): Field | Field> { - return !this.dictionary ? this : new Field>( - this.name, this.dictionary.indicies.type, - this.nullable, this.metadata, this.dictionary - ); - } - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitField(this); - } - toString() { - return this[Symbol.toStringTag] + - ` name[${this.name}]` + - `, nullable[${this.nullable}]` + - `, type[${this.type.toString()}]`; - } -} - -export class Buffer implements Partial { - public readonly offset: number; - public readonly length: number; - constructor(offset: Long | number, length: Long | number) { - this.offset = typeof offset === 'number' ? offset : offset.low; - this.length = typeof length === 'number' ? length : length.low; - } - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitBuffer(this); - } -} - -export class FieldNode implements Partial { - public readonly length: number; - public readonly nullCount: number; - constructor(length: Long | number, nullCount: Long | number) { - this.length = typeof length === 'number' ? length : length.low; - this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; - } - acceptMessageVisitor(visitor: MessageVisitor): any { - return visitor.visitFieldNode(this); - } -} diff --git a/js/src/ipc/metadata.ts b/js/src/ipc/metadata.ts new file mode 100644 index 0000000000000..88b7e52983b8e --- /dev/null +++ b/js/src/ipc/metadata.ts @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* tslint:disable:class-name */ + +import { align } from '../util/bit'; +import { Schema, Long, MessageHeader, MetadataVersion } from '../type'; + +export class Footer { + constructor(public dictionaryBatches: FileBlock[], public recordBatches: FileBlock[], public schema: Schema) {} +} + +export class FileBlock { + constructor(public metaDataLength: number, public bodyLength: Long, public offset: Long) {} +} + +export class Message { + public bodyLength: number; + public version: MetadataVersion; + public headerType: MessageHeader; + constructor(version: MetadataVersion, bodyLength: Long | number, headerType: MessageHeader) { + this.version = version; + this.headerType = headerType; + this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; + } + static isSchema(m: Message): m is Schema { return m.headerType === MessageHeader.Schema; } + static isRecordBatch(m: Message): m is RecordBatchMetadata { return m.headerType === MessageHeader.RecordBatch; } + static isDictionaryBatch(m: Message): m is DictionaryBatch { return m.headerType === MessageHeader.DictionaryBatch; } +} + +export class RecordBatchMetadata extends Message { + public length: number; + public nodes: FieldMetadata[]; + public buffers: BufferMetadata[]; + constructor(version: MetadataVersion, length: Long | number, nodes: FieldMetadata[], buffers: BufferMetadata[]) { + super(version, buffers.reduce((s, b) => align(s + b.length + (b.offset - s), 8), 0), MessageHeader.RecordBatch); + this.nodes = nodes; + this.buffers = buffers; + this.length = typeof length === 'number' ? length : length.low; + } +} + +export class DictionaryBatch extends Message { + public id: number; + public isDelta: boolean; + public data: RecordBatchMetadata; + constructor(version: MetadataVersion, data: RecordBatchMetadata, id: Long | number, isDelta: boolean = false) { + super(version, data.bodyLength, MessageHeader.DictionaryBatch); + this.isDelta = isDelta; + this.data = data; + this.id = typeof id === 'number' ? id : id.low; + } + private static atomicDictionaryId = 0; + public static getId() { return DictionaryBatch.atomicDictionaryId++; } + public get nodes(): FieldMetadata[] { return this.data.nodes; } + public get buffers(): BufferMetadata[] { return this.data.buffers; } +} + +export class BufferMetadata { + public offset: number; + public length: number; + constructor(offset: Long | number, length: Long | number) { + this.offset = typeof offset === 'number' ? offset : offset.low; + this.length = typeof length === 'number' ? length : length.low; + } +} + +export class FieldMetadata { + public length: number; + public nullCount: number; + constructor(length: Long | number, nullCount: Long | number) { + this.length = typeof length === 'number' ? length : length.low; + this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; + } +} diff --git a/js/src/ipc/reader/arrow.ts b/js/src/ipc/reader/arrow.ts new file mode 100644 index 0000000000000..bf525533116eb --- /dev/null +++ b/js/src/ipc/reader/arrow.ts @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { readJSON } from './json'; +import { RecordBatch } from '../../table'; +import { readBuffers, readBuffersAsync } from './binary'; +import { readRecordBatches, readRecordBatchesAsync, TypeDataLoader } from './vector'; +import { Schema } from '../../type'; +import { Message } from '../metadata'; + +export { readJSON, RecordBatch }; +export { readBuffers, readBuffersAsync }; +export { readRecordBatches, readRecordBatchesAsync }; + +export function* read(sources: Iterable | object | string) { + let input: any = sources; + let messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>; + if (typeof input === 'string') { + try { input = JSON.parse(input); } + catch (e) { input = sources; } + } + if (!input || typeof input !== 'object') { + messages = (typeof input === 'string') ? readBuffers([input]) : []; + } else { + messages = (typeof input[Symbol.iterator] === 'function') ? readBuffers(input) : readJSON(input); + } + yield* readRecordBatches(messages); +} + +export async function* readAsync(sources: AsyncIterable) { + for await (let recordBatch of readRecordBatchesAsync(readBuffersAsync(sources))) { + yield recordBatch; + } +} diff --git a/js/src/ipc/reader/binary.ts b/js/src/ipc/reader/binary.ts new file mode 100644 index 0000000000000..6e3c7fc5cf080 --- /dev/null +++ b/js/src/ipc/reader/binary.ts @@ -0,0 +1,449 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../../vector'; +import { flatbuffers } from 'flatbuffers'; +import { TypeDataLoader } from './vector'; +import { Message, Footer, FileBlock, RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata, } from '../metadata'; +import { + Schema, Field, + DataType, Dictionary, + Null, TimeBitWidth, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from '../../type'; + +import { + Int8, Uint8, + Int16, Uint16, + Int32, Uint32, + Int64, Uint64, + Float16, Float64, Float32, +} from '../../type'; + +import ByteBuffer = flatbuffers.ByteBuffer; + +type MessageReader = (bb: ByteBuffer) => IterableIterator; + +export function* readBuffers(sources: Iterable | Uint8Array | Buffer | string) { + let schema: Schema | null = null; + let dictionaries = new Map(); + let readMessages: MessageReader | null = null; + if (ArrayBuffer.isView(sources) || typeof sources === 'string') { + sources = [sources as T]; + } + for (const source of sources) { + const bb = toByteBuffer(source); + if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { + for (const message of readMessages(bb)) { + yield { + schema, message, + loader: new BinaryDataLoader( + bb, + arrayIterator(message.nodes), + arrayIterator(message.buffers), + dictionaries + ) + }; + } + } + } +} + +export async function* readBuffersAsync(sources: AsyncIterable) { + let schema: Schema | null = null; + let dictionaries = new Map(); + let readMessages: MessageReader | null = null; + for await (const source of sources) { + const bb = toByteBuffer(source); + if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { + for (const message of readMessages(bb)) { + yield { + schema, message, + loader: new BinaryDataLoader( + bb, + arrayIterator(message.nodes), + arrayIterator(message.buffers), + dictionaries + ) + }; + } + } + } +} + +export class BinaryDataLoader extends TypeDataLoader { + private bytes: Uint8Array; + private messageOffset: number; + constructor(bb: ByteBuffer, nodes: Iterator, buffers: Iterator, dictionaries: Map) { + super(nodes, buffers, dictionaries); + this.bytes = bb.bytes(); + this.messageOffset = bb.position(); + } + protected readOffsets(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); } + protected readTypeIds(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); } + protected readData(_type: T, { length, offset }: BufferMetadata = this.getBufferMetadata()) { + return new Uint8Array(this.bytes.buffer, this.bytes.byteOffset + this.messageOffset + offset, length); + } +} + +function* arrayIterator(arr: Array) { yield* arr; } + +function toByteBuffer(bytes?: Uint8Array | Buffer | string) { + let arr: Uint8Array = bytes as any || new Uint8Array(0); + if (typeof bytes === 'string') { + arr = new Uint8Array(bytes.length); + for (let i = -1, n = bytes.length; ++i < n;) { + arr[i] = bytes.charCodeAt(i); + } + return new ByteBuffer(arr); + } + return new ByteBuffer(arr); +} + +function readSchema(bb: ByteBuffer) { + let schema: Schema, readMessages, footer: Footer | null; + if (footer = readFileSchema(bb)) { + schema = footer.schema; + readMessages = readFileMessages(footer); + } else if (schema = readStreamSchema(bb)!) { + readMessages = readStreamMessages; + } else { + throw new Error('Invalid Arrow buffer'); + } + return { schema, readMessages }; +} + +const PADDING = 4; +const MAGIC_STR = 'ARROW1'; +const MAGIC = new Uint8Array(MAGIC_STR.length); +for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { + MAGIC[i] = MAGIC_STR.charCodeAt(i); +} + +function checkForMagicArrowString(buffer: Uint8Array, index = 0) { + for (let i = -1, n = MAGIC.length; ++i < n;) { + if (MAGIC[i] !== buffer[index + i]) { + return false; + } + } + return true; +} + +const magicLength = MAGIC.length; +const magicAndPadding = magicLength + PADDING; +const magicX2AndPadding = magicLength * 2 + PADDING; + +function readStreamSchema(bb: ByteBuffer) { + if (!checkForMagicArrowString(bb.bytes(), 0)) { + for (const message of readMessages(bb)) { + if (Message.isSchema(message)) { + return message as Schema; + } + } + } + return null; +} + +function* readStreamMessages(bb: ByteBuffer) { + for (const message of readMessages(bb)) { + if (Message.isRecordBatch(message)) { + yield message; + } else if (Message.isDictionaryBatch(message)) { + yield message; + } else { + continue; + } + // position the buffer after the body to read the next message + bb.setPosition(bb.position() + message.bodyLength); + } +} + +function readFileSchema(bb: ByteBuffer) { + let fileLength = bb.capacity(), footerLength: number, footerOffset: number; + if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || + (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || + (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || + (/* Invalid footer length */ + (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 && + (footerLength + magicX2AndPadding > fileLength))) { + return null; + } + bb.setPosition(footerOffset - footerLength); + return footerFromByteBuffer(bb); +} + +function readFileMessages(footer: Footer) { + return function* (bb: ByteBuffer) { + for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { + bb.setPosition(batches[i].offset.low); + yield readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch; + } + for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { + bb.setPosition(batches[i].offset.low); + yield readMessage(bb, bb.readInt32(bb.position())) as RecordBatchMetadata; + } + }; +} + +function* readMessages(bb: ByteBuffer) { + let length: number, message: Schema | RecordBatchMetadata | DictionaryBatch; + while (bb.position() < bb.capacity() && + (length = bb.readInt32(bb.position())) > 0) { + if (message = readMessage(bb, length)!) { + yield message; + } + } +} + +function readMessage(bb: ByteBuffer, length: number) { + bb.setPosition(bb.position() + PADDING); + const message = messageFromByteBuffer(bb); + bb.setPosition(bb.position() + length); + return message; +} + +import * as File_ from '../../fb/File'; +import * as Schema_ from '../../fb/Schema'; +import * as Message_ from '../../fb/Message'; + +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; +import _Footer = File_.org.apache.arrow.flatbuf.Footer; +import _Block = File_.org.apache.arrow.flatbuf.Block; +import _Message = Message_.org.apache.arrow.flatbuf.Message; +import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; +import _Field = Schema_.org.apache.arrow.flatbuf.Field; +import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; +import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; +import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; +import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; +import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; +import _Null = Schema_.org.apache.arrow.flatbuf.Null; +import _Int = Schema_.org.apache.arrow.flatbuf.Int; +import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; +import _Binary = Schema_.org.apache.arrow.flatbuf.Binary; +import _Bool = Schema_.org.apache.arrow.flatbuf.Bool; +import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8; +import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal; +import _Date = Schema_.org.apache.arrow.flatbuf.Date; +import _Time = Schema_.org.apache.arrow.flatbuf.Time; +import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp; +import _Interval = Schema_.org.apache.arrow.flatbuf.Interval; +import _List = Schema_.org.apache.arrow.flatbuf.List; +import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_; +import _Union = Schema_.org.apache.arrow.flatbuf.Union; +import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary; +import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; +import _Map = Schema_.org.apache.arrow.flatbuf.Map; + +function footerFromByteBuffer(bb: ByteBuffer) { + const dictionaryFields = new Map>(); + const f = _Footer.getRootAsFooter(bb), s = f.schema()!; + return new Footer( + dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f), + new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), f.version(), dictionaryFields) + ); +} + +function messageFromByteBuffer(bb: ByteBuffer) { + const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version(); + switch (type) { + case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!, new Map()); + case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m.header(new _RecordBatch())!); + case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m.header(new _DictionaryBatch())!); + } + return null; + // throw new Error(`Unrecognized Message type '${type}'`); +} + +function schemaFromMessage(version: MetadataVersion, s: _Schema, dictionaryFields: Map>) { + return new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), version, dictionaryFields); +} + +function recordBatchFromMessage(version: MetadataVersion, b: _RecordBatch) { + return new RecordBatchMetadata(version, b.length(), fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version)); +} + +function dictionaryBatchFromMessage(version: MetadataVersion, d: _DictionaryBatch) { + return new DictionaryBatch(version, recordBatchFromMessage(version, d.data()!), d.id(), d.isDelta()); +} + +function dictionaryBatchesFromFooter(f: _Footer) { + const blocks = [] as FileBlock[]; + for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) { + if (b = f.dictionaries(i)!) { + blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset())); + } + } + return blocks; +} + +function recordBatchesFromFooter(f: _Footer) { + const blocks = [] as FileBlock[]; + for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) { + if (b = f.recordBatches(i)!) { + blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset())); + } + } + return blocks; +} + +function fieldsFromSchema(s: _Schema, dictionaryFields: Map> | null) { + const fields = [] as Field[]; + for (let i = -1, c: Field | null, n = s && s.fieldsLength(); ++i < n;) { + if (c = field(s.fields(i)!, dictionaryFields)) { + fields.push(c); + } + } + return fields; +} + +function fieldsFromField(f: _Field, dictionaryFields: Map> | null) { + const fields = [] as Field[]; + for (let i = -1, c: Field | null, n = f && f.childrenLength(); ++i < n;) { + if (c = field(f.children(i)!, dictionaryFields)) { + fields.push(c); + } + } + return fields; +} + +function fieldNodesFromRecordBatch(b: _RecordBatch) { + const fieldNodes = [] as FieldMetadata[]; + for (let i = -1, n = b.nodesLength(); ++i < n;) { + fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!)); + } + return fieldNodes; +} + +function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) { + const buffers = [] as BufferMetadata[]; + for (let i = -1, n = b.buffersLength(); ++i < n;) { + let buffer = b.buffers(i)!; + // If this Arrow buffer was written before version 4, + // advance the buffer's bb_pos 8 bytes to skip past + // the now-removed page id field. + if (version < MetadataVersion.V4) { + buffer.bb_pos += (8 * (i + 1)); + } + buffers.push(bufferFromRecordBatch(buffer)); + } + return buffers; +} + +function field(f: _Field, dictionaryFields: Map> | null) { + let name = f.name()!; + let field: Field | void; + let nullable = f.nullable(); + let metadata = customMetadata(f); + let dataType: DataType | null; + let keysMeta: _Int | null, id: number; + let dictMeta: _DictionaryEncoding | null; + if (!dictionaryFields || !(dictMeta = f.dictionary())) { + if (dataType = typeFromField(f, fieldsFromField(f, dictionaryFields))) { + field = new Field(name, dataType, nullable, metadata); + } + } else if (dataType = dictionaryFields.has(id = dictMeta.id().low) + ? dictionaryFields.get(id)!.type.dictionary + : typeFromField(f, fieldsFromField(f, null))) { + dataType = new Dictionary(dataType, + // a dictionary index defaults to signed 32 bit int if unspecified + (keysMeta = dictMeta.indexType()) ? intFromField(keysMeta)! : new Int32(), + id, dictMeta.isOrdered() + ); + field = new Field(name, dataType, nullable, metadata); + dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); + } + return field || null; +} + +function customMetadata(parent?: _Schema | _Field | null) { + const data = new Map(); + if (parent) { + for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { + if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { + data.set(key, entry.value()!); + } + } + } + return data; +} + +function fieldNodeFromRecordBatch(f: _FieldNode) { + return new FieldMetadata(f.length(), f.nullCount()); +} + +function bufferFromRecordBatch(b: _Buffer) { + return new BufferMetadata(b.offset(), b.length()); +} + +function typeFromField(f: _Field, children?: Field[]): DataType | null { + switch (f.typeType()) { + case Type.NONE: return null; + case Type.Null: return nullFromField(f.type(new _Null())!); + case Type.Int: return intFromField(f.type(new _Int())!); + case Type.FloatingPoint: return floatFromField(f.type(new _FloatingPoint())!); + case Type.Binary: return binaryFromField(f.type(new _Binary())!); + case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); + case Type.Bool: return boolFromField(f.type(new _Bool())!); + case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); + case Type.Date: return dateFromField(f.type(new _Date())!); + case Type.Time: return timeFromField(f.type(new _Time())!); + case Type.Timestamp: return timestampFromField(f.type(new _Timestamp())!); + case Type.Interval: return intervalFromField(f.type(new _Interval())!); + case Type.List: return listFromField(f.type(new _List())!, children || []); + case Type.Struct_: return structFromField(f.type(new _Struct())!, children || []); + case Type.Union: return unionFromField(f.type(new _Union())!, children || []); + case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); + case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!, children || []); + case Type.Map: return mapFromField(f.type(new _Map())!, children || []); + } + throw new Error(`Unrecognized type ${f.typeType()}`); +} + +function nullFromField (_type: _Null) { return new Null(); } +function intFromField (_type: _Int) { switch (_type.bitWidth()) { + case 8: return _type.isSigned() ? new Int8() : new Uint8(); + case 16: return _type.isSigned() ? new Int16() : new Uint16(); + case 32: return _type.isSigned() ? new Int32() : new Uint32(); + case 64: return _type.isSigned() ? new Int64() : new Uint64(); + } + return null; } +function floatFromField (_type: _FloatingPoint) { switch (_type.precision()) { + case Precision.HALF: return new Float16(); + case Precision.SINGLE: return new Float32(); + case Precision.DOUBLE: return new Float64(); + } + return null; } +function binaryFromField (_type: _Binary) { return new Binary(); } +function utf8FromField (_type: _Utf8) { return new Utf8(); } +function boolFromField (_type: _Bool) { return new Bool(); } +function decimalFromField (_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } +function dateFromField (_type: _Date) { return new Date_(_type.unit()); } +function timeFromField (_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } +function timestampFromField (_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } +function intervalFromField (_type: _Interval) { return new Interval(_type.unit()); } +function listFromField (_type: _List, children: Field[]) { return new List(children); } +function structFromField (_type: _Struct, children: Field[]) { return new Struct(children); } +function unionFromField (_type: _Union, children: Field[]) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[], children); } +function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } +function fixedSizeListFromField (_type: _FixedSizeList, children: Field[]) { return new FixedSizeList(_type.listSize(), children); } +function mapFromField (_type: _Map, children: Field[]) { return new Map_(_type.keysSorted(), children); } diff --git a/js/src/ipc/reader/json.ts b/js/src/ipc/reader/json.ts new file mode 100644 index 0000000000000..c3f88f7d1e7ce --- /dev/null +++ b/js/src/ipc/reader/json.ts @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../../vector'; +import { flatbuffers } from 'flatbuffers'; +import { TypeDataLoader } from './vector'; +import { packBools } from '../../util/bit'; +import { Int64, Int128 } from '../../util/int'; +import { TextEncoder } from 'text-encoding-utf-8'; +import { RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata } from '../metadata'; +import { + Schema, Field, + DataType, Dictionary, + Null, TimeBitWidth, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from '../../type'; + +import { + Int8, Uint8, + Int16, Uint16, + Int32, Uint32, + Int64 as Int64_, Uint64, + Float16, Float64, Float32, +} from '../../type'; + +import Long = flatbuffers.Long; + +export function* readJSON(json: any) { + const schema = schemaFromJSON(json['schema']); + const dictionaries = new Map(); + for (const batch of (json['dictionaries'] || [])) { + const message = dictionaryBatchFromJSON(batch); + yield { + schema, message, + loader: new JSONDataLoader( + flattenDataSources(batch['data']['columns']), + arrayIterator(message.nodes), + arrayIterator(message.buffers), + dictionaries + ) + }; + } + for (const batch of (json['batches'] || [])) { + const message = recordBatchFromJSON(batch); + yield { + schema, message, + loader: new JSONDataLoader( + flattenDataSources(batch['columns']), + arrayIterator(message.nodes), + arrayIterator(message.buffers), + dictionaries + ) + }; + } +} + +function* arrayIterator(arr: Array) { yield* arr; } +function flattenDataSources(xs: any[]): any[][] { + return (xs || []).reduce((buffers, column: any) => [ + ...buffers, + ...(column['VALIDITY'] && [column['VALIDITY']] || []), + ...(column['OFFSET'] && [column['OFFSET']] || []), + ...(column['DATA'] && [column['DATA']] || []), + ...flattenDataSources(column['children']) + ], [] as any[][]); +} + +const utf8Encoder = new TextEncoder('utf-8'); + +export class JSONDataLoader extends TypeDataLoader { + constructor(private sources: any[][], nodes: Iterator, buffers: Iterator, dictionaries: Map) { + super(nodes, buffers, dictionaries); + } + protected readNullBitmap(_type: T, nullCount: number, { offset } = this.getBufferMetadata()) { + return nullCount <= 0 ? new Uint8Array(0) : packBools(this.sources[offset]); + } + protected readOffsets(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { + return new Int32Array(this.sources[offset]); + } + protected readTypeIds(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { + return new Int8Array(this.sources[offset]); + } + protected readData(type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { + const { sources } = this; + if (DataType.isTimestamp(type) === true) { + return new Uint8Array(int64DataFromJSON(sources[offset] as string[])); + } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) { + return new Uint8Array(int64DataFromJSON(sources[offset] as string[])); + } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) { + return new Uint8Array(int64DataFromJSON(sources[offset] as string[])); + } else if (DataType.isDecimal(type) === true) { + return new Uint8Array(decimalDataFromJSON(sources[offset] as string[])); + } else if (DataType.isBinary(type) === true) { + return new Uint8Array(binaryDataFromJSON(sources[offset] as string[])); + } else if (DataType.isBool(type) === true) { + return new Uint8Array(packBools(sources[offset] as number[]).buffer); + } else if (DataType.isUtf8(type) === true) { + return utf8Encoder.encode((sources[offset] as string[]).join('')); + } else { + return toTypedArray(type.ArrayType, sources[offset].map((x) => +x)) as any; + } + } +} + +function int64DataFromJSON(values: string[]) { + const data = new Uint32Array(values.length * 2); + for (let i = -1, n = values.length; ++i < n;) { + // Force all values (even numbers) to be parsed as strings since + // pulling out high and low bits seems to lose precision sometimes + // For example: + // > -4613034156400212000 >>> 0 + // 721782784 + // The correct lower 32-bits are 721782752 + Int64.fromString(values[i].toString(), new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2)); + } + return data.buffer; +} + +function decimalDataFromJSON(values: string[]) { + const data = new Uint32Array(values.length * 4); + for (let i = -1, n = values.length; ++i < n;) { + Int128.fromString(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4)); + } + return data.buffer; +} + +function binaryDataFromJSON(values: string[]) { + // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] + // There are definitely more efficient ways to do this... but it gets the + // job done. + const joined = values.join(''); + const data = new Uint8Array(joined.length / 2); + for (let i = 0; i < joined.length; i += 2) { + data[i >> 1] = parseInt(joined.substr(i, 2), 16); + } + return data.buffer; +} + +import * as Schema_ from '../../fb/Schema'; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; +import { toTypedArray } from '../../data'; + +function schemaFromJSON(s: any): Schema { + const dictionaryFields = new Map>(); + return new Schema( + fieldsFromJSON(s['fields'], dictionaryFields), + customMetadata(s['customMetadata']), + MetadataVersion.V4, dictionaryFields + ); +} + +function recordBatchFromJSON(b: any): RecordBatchMetadata { + return new RecordBatchMetadata( + MetadataVersion.V4, + b['count'], + fieldNodesFromJSON(b['columns']), + buffersFromJSON(b['columns']) + ); +} + +function dictionaryBatchFromJSON(b: any): DictionaryBatch { + return new DictionaryBatch( + MetadataVersion.V4, + recordBatchFromJSON(b['data']), + b['id'], b['isDelta'] + ); +} + +function fieldsFromJSON(fs: any[], dictionaryFields: Map> | null): Field[] { + return (fs || []) + .map((f) => fieldFromJSON(f, dictionaryFields)) + .filter((f) => f != null) as Field[]; +} + +function fieldNodesFromJSON(xs: any[]): FieldMetadata[] { + return (xs || []).reduce((fieldNodes, column: any) => [ + ...fieldNodes, + new FieldMetadata( + new Long(column['count'], 0), + new Long(nullCountFromJSON(column['VALIDITY']), 0) + ), + ...fieldNodesFromJSON(column['children']) + ], [] as FieldMetadata[]); +} + +function buffersFromJSON(xs: any[], buffers: BufferMetadata[] = []): BufferMetadata[] { + for (let i = -1, n = (xs || []).length; ++i < n;) { + const column = xs[i]; + column['VALIDITY'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); + column['OFFSET'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); + column['DATA'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); + buffers = buffersFromJSON(column['children'], buffers); + } + return buffers; +} + +function nullCountFromJSON(validity: number[]) { + return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); +} + +function fieldFromJSON(f: any, dictionaryFields: Map> | null) { + let name = f['name']; + let field: Field | void; + let nullable = f['nullable']; + let dataType: DataType | null; + let id: number, keysMeta: any, dictMeta: any; + let metadata = customMetadata(f['customMetadata']); + if (!dictionaryFields || !(dictMeta = f['dictionary'])) { + if (dataType = typeFromJSON(f['type'], fieldsFromJSON(f['children'], dictionaryFields))) { + field = new Field(name, dataType, nullable, metadata); + } + } else if (dataType = dictionaryFields.has(id = dictMeta['id']) + ? dictionaryFields.get(id)!.type.dictionary + : typeFromJSON(f['type'], fieldsFromJSON(f['children'], null))) { + dataType = new Dictionary(dataType, + // a dictionary index defaults to signed 32 bit int if unspecified + (keysMeta = dictMeta['indexType']) ? intFromJSON(keysMeta)! : new Int32(), + id, dictMeta['isOrdered'] + ); + field = new Field(name, dataType, nullable, metadata); + dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); + } + return field || null; +} + +function customMetadata(metadata?: any) { + return new Map(Object.entries(metadata || {})); +} + +const namesToTypeMap: { [n: string]: Type } = { + 'NONE': Type.NONE, + 'null': Type.Null, + 'int': Type.Int, + 'floatingpoint': Type.FloatingPoint, + 'binary': Type.Binary, + 'bool': Type.Bool, + 'utf8': Type.Utf8, + 'decimal': Type.Decimal, + 'date': Type.Date, + 'time': Type.Time, + 'timestamp': Type.Timestamp, + 'interval': Type.Interval, + 'list': Type.List, + 'struct': Type.Struct_, + 'union': Type.Union, + 'fixedsizebinary': Type.FixedSizeBinary, + 'fixedsizelist': Type.FixedSizeList, + 'map': Type.Map, +}; + +function typeFromJSON(t: any, children?: Field[]) { + switch (namesToTypeMap[t['name']]) { + case Type.NONE: return null; + case Type.Null: return nullFromJSON(t); + case Type.Int: return intFromJSON(t); + case Type.FloatingPoint: return floatingPointFromJSON(t); + case Type.Binary: return binaryFromJSON(t); + case Type.Utf8: return utf8FromJSON(t); + case Type.Bool: return boolFromJSON(t); + case Type.Decimal: return decimalFromJSON(t); + case Type.Date: return dateFromJSON(t); + case Type.Time: return timeFromJSON(t); + case Type.Timestamp: return timestampFromJSON(t); + case Type.Interval: return intervalFromJSON(t); + case Type.List: return listFromJSON(t, children || []); + case Type.Struct_: return structFromJSON(t, children || []); + case Type.Union: return unionFromJSON(t, children || []); + case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); + case Type.FixedSizeList: return fixedSizeListFromJSON(t, children || []); + case Type.Map: return mapFromJSON(t, children || []); + } + throw new Error(`Unrecognized type ${t['name']}`); +} + +function nullFromJSON (_type: any) { return new Null(); } +function intFromJSON (_type: any) { switch (_type['bitWidth']) { + case 8: return _type['isSigned'] ? new Int8() : new Uint8(); + case 16: return _type['isSigned'] ? new Int16() : new Uint16(); + case 32: return _type['isSigned'] ? new Int32() : new Uint32(); + case 64: return _type['isSigned'] ? new Int64_() : new Uint64(); + } + return null; } +function floatingPointFromJSON (_type: any) { switch (Precision[_type['precision']] as any) { + case Precision.HALF: return new Float16(); + case Precision.SINGLE: return new Float32(); + case Precision.DOUBLE: return new Float64(); + } + return null; } +function binaryFromJSON (_type: any) { return new Binary(); } +function utf8FromJSON (_type: any) { return new Utf8(); } +function boolFromJSON (_type: any) { return new Bool(); } +function decimalFromJSON (_type: any) { return new Decimal(_type['scale'], _type['precision']); } +function dateFromJSON (_type: any) { return new Date_(DateUnit[_type['unit']] as any); } +function timeFromJSON (_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } +function timestampFromJSON (_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } +function intervalFromJSON (_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } +function listFromJSON (_type: any, children: Field[]) { return new List(children); } +function structFromJSON (_type: any, children: Field[]) { return new Struct(children); } +function unionFromJSON (_type: any, children: Field[]) { return new Union(_type['mode'], (_type['typeIdsArray'] || []) as Type[], children); } +function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } +function fixedSizeListFromJSON (_type: any, children: Field[]) { return new FixedSizeList(_type['listSize'], children); } +function mapFromJSON (_type: any, children: Field[]) { return new Map_(_type['keysSorted'], children); } diff --git a/js/src/ipc/reader/vector.ts b/js/src/ipc/reader/vector.ts new file mode 100644 index 0000000000000..6386eb7b689cd --- /dev/null +++ b/js/src/ipc/reader/vector.ts @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../../vector'; +import { RecordBatch } from '../../table'; +import { TypeVisitor } from '../../visitor'; +import { FlatType, NestedType, ListType } from '../../type'; +import { Message, FieldMetadata, BufferMetadata } from '../metadata'; +import { FlatData, ListData, NestedData, DenseUnionData, SparseUnionData, BoolData, FlatListData, DictionaryData } from '../../data'; +import { + Schema, Field, + Dictionary, + Null, Int, Float, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, + UnionMode, SparseUnion, DenseUnion, FlatListType, DataType, +} from '../../type'; + +export function* readRecordBatches(messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { + for (const { schema, message, loader } of messages) { + yield* readRecordBatch(schema, message, loader); + } +} + +export async function* readRecordBatchesAsync(messages: AsyncIterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { + for await (const { schema, message, loader } of messages) { + yield* readRecordBatch(schema, message, loader); + } +} + +export function* readRecordBatch(schema: Schema, message: Message, loader: TypeDataLoader) { + if (Message.isRecordBatch(message)) { + yield new RecordBatch(schema, message.length, loader.visitFields(schema.fields)); + } else if (Message.isDictionaryBatch(message)) { + const dictionaryId = message.id; + const dictionaries = loader.dictionaries; + const dictionaryField = schema.dictionaries.get(dictionaryId)!; + const dictionaryDataType = (dictionaryField.type as Dictionary).dictionary; + let dictionaryVector = Vector.create(loader.visit(dictionaryDataType)); + if (message.isDelta && dictionaries.has(dictionaryId)) { + dictionaryVector = dictionaries.get(dictionaryId)!.concat(dictionaryVector); + } + dictionaries.set(dictionaryId, dictionaryVector); + } +} + +export abstract class TypeDataLoader extends TypeVisitor { + + public dictionaries: Map; + protected nodes: Iterator; + protected buffers: Iterator; + + constructor(nodes: Iterator, buffers: Iterator, dictionaries: Map) { + super(); + this.nodes = nodes; + this.buffers = buffers; + this.dictionaries = dictionaries; + } + + public visitFields(fields: Field[]) { return fields.map((field) => this.visit(field.type)); } + + public visitNull (type: Null) { return this.visitNullType(type); } + public visitInt (type: Int) { return this.visitFlatType(type); } + public visitFloat (type: Float) { return this.visitFlatType(type); } + public visitBinary (type: Binary) { return this.visitFlatList(type); } + public visitUtf8 (type: Utf8) { return this.visitFlatList(type); } + public visitBool (type: Bool) { return this.visitBoolType(type); } + public visitDecimal (type: Decimal) { return this.visitFlatType(type); } + public visitDate (type: Date_) { return this.visitFlatType(type); } + public visitTime (type: Time) { return this.visitFlatType(type); } + public visitTimestamp (type: Timestamp) { return this.visitFlatType(type); } + public visitInterval (type: Interval) { return this.visitFlatType(type); } + public visitList (type: List) { return this.visitListType(type); } + public visitStruct (type: Struct) { return this.visitNestedType(type); } + public visitUnion (type: Union) { return this.visitUnionType(type); } + public visitFixedSizeBinary(type: FixedSizeBinary) { return this.visitFlatType(type); } + public visitFixedSizeList (type: FixedSizeList) { return this.visitListType(type); } + public visitMap (type: Map_) { return this.visitNestedType(type); } + public visitDictionary (type: Dictionary) { + return new DictionaryData(type, this.dictionaries.get(type.id)!, this.visit(type.indicies)); + } + protected getFieldMetadata() { return this.nodes.next().value; } + protected getBufferMetadata() { return this.buffers.next().value; } + protected readNullBitmap(type: T, nullCount: number, buffer = this.getBufferMetadata()) { + return nullCount > 0 && this.readData(type, buffer) || new Uint8Array(0); + } + protected abstract readData(type: T, buffer?: BufferMetadata): any; + protected abstract readOffsets(type: T, buffer?: BufferMetadata): any; + protected abstract readTypeIds(type: T, buffer?: BufferMetadata): any; + protected visitNullType(type: Null, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new FlatData(type, length, this.readNullBitmap(type, nullCount), new Uint8Array(0), 0, nullCount); + } + protected visitFlatType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new FlatData(type, length, this.readNullBitmap(type, nullCount), this.readData(type), 0, nullCount); + } + protected visitBoolType(type: Bool, { length, nullCount }: FieldMetadata = this.getFieldMetadata(), data?: Uint8Array) { + return new BoolData(type, length, this.readNullBitmap(type, nullCount), data || this.readData(type), 0, nullCount); + } + protected visitFlatList(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new FlatListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type), 0, nullCount); + } + protected visitListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new ListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children![0].type), 0, nullCount); + } + protected visitNestedType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new NestedData(type, length, this.readNullBitmap(type, nullCount), this.visitFields(type.children), 0, nullCount); + } + protected visitUnionType(type: DenseUnion | SparseUnion, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return type.mode === UnionMode.Sparse ? + new SparseUnionData(type as SparseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount) : + new DenseUnionData(type as DenseUnion, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount); + } +} diff --git a/js/src/table.ts b/js/src/table.ts new file mode 100644 index 0000000000000..2bf5da367fbf5 --- /dev/null +++ b/js/src/table.ts @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data, NestedData } from './data'; +import { Schema, DataType, Struct, IterableArrayLike } from './type'; +import { flatbuffers } from 'flatbuffers'; +import { StructView, RowView } from './vector/nested'; +import { read, readAsync } from './ipc/reader/arrow'; +import { View, Vector, createVector } from './vector'; +import { isPromise, isAsyncIterable } from './util/compat'; + +import Long = flatbuffers.Long; + +export class RecordBatch { + public readonly numRows: number; + public readonly schema: Schema; + public readonly data: Data[]; + public readonly columns: Vector[]; + constructor(schema: Schema, numRows: Long | number, columnsOrData: (Data | Vector)[]) { + const data: Data[] = new Array(columnsOrData.length); + const columns: Vector[] = new Array(columnsOrData.length); + for (let index = -1, length = columnsOrData.length; ++index < length;) { + const col: Data | Vector = columnsOrData[index]; + if (col instanceof Vector) { + data[index] = (columns[index] = col as Vector).data; + } else { + columns[index] = createVector(data[index] = col); + } + } + this.data = data; + this.schema = schema; + this.columns = columns; + this.numRows = typeof numRows === 'number' ? numRows : numRows.low; + } + public get numCols() { return this.columns.length; } + public concat(...others: RecordBatch[]): RecordBatch { + return new RecordBatch( + this.schema, + others.reduce((numRows, batch) => numRows + batch.numRows, this.numRows), + others.reduce((columns, batch) => + columns.map((col, idx) => col.concat(batch.columns[idx])), + this.columns + ) + ); + } +} + +export class Table { + + public static from = syncTableFromInputs; + public static fromAsync = asyncTableFromInputs; + public static empty() { return new Table(new Schema([]), []); } + + protected _view: View; + public readonly schema: Schema; + public readonly columns: Vector[]; + + constructor(schema: Schema, columns: Vector[]) { + this.schema = schema; + this.columns = columns; + this._view = new StructView( + new NestedData( + new Struct(schema.fields), + this.numRows, new Uint8Array(0), + columns.map((col) => col.data) + ), + columns + ); + } + public get numCols() { return this.columns.length; } + public get numRows() { return this.columns[0].length; } + public get(index: number) { return this._view.get(index); } + public toArray(): IterableArrayLike { return this._view.toArray(); } + public [Symbol.iterator](): IterableIterator { + return this._view[Symbol.iterator](); + } + public select(...columnNames: string[]) { + const fields = this.schema.fields; + const namesToKeep = columnNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + return new Table( + this.schema.select(...columnNames), + this.columns.filter((_, index) => namesToKeep[fields[index].name]) + ); + } + public rowsToString(separator = ' | '): TableToStringIterator { + return new TableToStringIterator(tableRowsToString(this, separator)); + } +} + +export function syncTableFromInputs(sources?: Iterable | object | string) { + let schema: Schema | undefined, columns: Vector[] = []; + if (sources) { + for (let recordBatch of read(sources)) { + schema = schema || recordBatch.schema; + columns = concatVectors(columns, recordBatch.columns); + } + return new Table(schema!, columns); + } + return Table.empty(); +} + +export async function* asyncTableFromInputs(sources?: Iterable | AsyncIterable | Promise | object | string) { + let columns: Vector[] = []; + let schema: Schema | undefined; + if (isAsyncIterable(sources)) { + for await (let recordBatch of readAsync(sources)) { + schema = schema || recordBatch.schema; + columns = concatVectors(columns, recordBatch.columns); + } + return new Table(schema!, columns); + } else if (isPromise(sources)) { + return Table.from(await sources); + } else if (sources) { + return Table.from(sources); + } + return Table.empty(); +} + +export class TableToStringIterator implements IterableIterator { + constructor(private iterator: IterableIterator) {} + [Symbol.iterator]() { return this.iterator; } + next(value?: any) { return this.iterator.next(value); } + throw(error?: any) { return this.iterator.throw && this.iterator.throw(error) || { done: true, value: '' }; } + return(value?: any) { return this.iterator.return && this.iterator.return(value) || { done: true, value: '' }; } + pipe(stream: NodeJS.WritableStream) { + let res: IteratorResult; + let write = () => { + if (stream.writable) { + do { + if ((res = this.next()).done) { break; } + } while (stream.write(res.value + '\n', 'utf8')); + } + if (!res || !res.done) { + stream.once('drain', write); + } else if (!(stream as any).isTTY) { + stream.end('\n'); + } + }; + write(); + } +} + +function *tableRowsToString(table: Table, separator = ' | ') { + const fields = table.schema.fields; + const header = ['row_id', ...fields.map((f) => `${f}`)].map(stringify); + const maxColumnWidths = header.map(x => x.length); + // Pass one to convert to strings and count max column widths + for (let i = -1, n = table.numRows - 1; ++i < n;) { + let val, row = [i, ...table.get(i)]; + for (let j = -1, k = row.length; ++j < k; ) { + val = stringify(row[j]); + maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); + } + } + yield header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(separator); + for (let i = -1, n = table.numRows; ++i < n;) { + yield [i, ...table.get(i)] + .map((x) => stringify(x)) + .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) + .join(separator); + } +} + +function concatVectors(tableVectors: Vector[], batchVectors: Vector[]) { + return tableVectors.length === 0 ? batchVectors : batchVectors.map((vec, i, _vs, col = tableVectors[i]) => + vec && col && col.concat(vec) || col || vec + ) as Vector[]; +} + +function leftPad(str: string, fill: string, n: number) { + return (new Array(n + 1).join(fill) + str).slice(-1 * n); +} + +function stringify(x: any) { + return typeof x === 'string' ? `"${x}"` : ArrayBuffer.isView(x) ? `[${x}]` : JSON.stringify(x); +} diff --git a/js/src/type.ts b/js/src/type.ts index 0c39a449e0cb6..2a9ea12a7164a 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. -import { Data } from './data'; import * as Schema_ from './fb/Schema'; +import * as Message_ from './fb/Message'; import { Vector, View } from './vector'; import { flatbuffers } from 'flatbuffers'; +import { DictionaryBatch } from './ipc/metadata'; import { TypeVisitor, VisitorNode } from './visitor'; -import { Field, DictionaryBatch } from './ipc/message'; export import Long = flatbuffers.Long; export import ArrowType = Schema_.org.apache.arrow.flatbuf.Type; @@ -30,9 +30,63 @@ export import Precision = Schema_.org.apache.arrow.flatbuf.Precision; export import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; export import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType; export import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; + +export class Schema { + // @ts-ignore + protected _bodyLength: number; + // @ts-ignore + protected _headerType: MessageHeader; + public readonly fields: Field[]; + public readonly version: MetadataVersion; + public readonly metadata?: Map; + public readonly dictionaries: Map>; + constructor(fields: Field[], + metadata?: Map, + version: MetadataVersion = MetadataVersion.V4, + dictionaries: Map> = new Map()) { + this.fields = fields; + this.version = version; + this.metadata = metadata; + this.dictionaries = dictionaries; + } + public get bodyLength() { return this._bodyLength; } + public get headerType() { return this._headerType; } + public select(...fieldNames: string[]): Schema { + const namesToKeep = fieldNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + const newDictFields = new Map(), newFields = this.fields.filter((f) => namesToKeep[f.name]); + this.dictionaries.forEach((f, dictId) => (namesToKeep[f.name]) && newDictFields.set(dictId, f)); + return new Schema(newFields, this.metadata, this.version, newDictFields); + } + public static [Symbol.toStringTag] = ((prototype: Schema) => { + prototype._bodyLength = 0; + prototype._headerType = MessageHeader.Schema; + return 'Schema'; + })(Schema.prototype); +} + +export class Field { + public readonly type: T; + public readonly name: string; + public readonly nullable: boolean; + public readonly metadata?: Map | null; + constructor(name: string, type: T, nullable = false, metadata?: Map | null) { + this.name = name; + this.type = type; + this.nullable = nullable; + this.metadata = metadata; + } + public toString() { return `${this.name}: ${this.type}`; } + public get typeId(): T['TType'] { return this.type.TType; } + public get [Symbol.toStringTag](): string { return 'Field'; } + public get indicies(): T | Int { + return DataType.isDictionary(this.type) ? this.type.indicies : this.type; + } +} export type TimeBitWidth = 32 | 64; -export type IntBitWidth = 1 | 8 | 16 | 32 | 64; +export type IntBitWidth = 8 | 16 | 32 | 64; export type NumericType = Int | Float | Date_ | Time | Interval | Timestamp; export type FixedSizeType = Int64 | Uint64 | Decimal | FixedSizeBinary; @@ -40,7 +94,7 @@ export type PrimitiveType = NumericType | FixedSizeType; export type FlatListType = Utf8 | Binary; // <-- these types have `offset`, `data`, and `validity` buffers export type FlatType = Bool | PrimitiveType | FlatListType; // <-- these types have `data` and `validity` buffers -export type ListType = List | FixedSizeList | FlatListType; // <-- these types have `offset` and `validity` buffers +export type ListType = List | FixedSizeList; // <-- these types have `offset` and `validity` buffers export type NestedType = Map_ | Struct | List | FixedSizeList | Union; // <-- these types have `validity` buffer and nested childData /** @@ -52,7 +106,7 @@ export type NestedType = Map_ | Struct | List | FixedSizeList | Union< * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64) */ - export const enum Type { + export enum Type { NONE = 0, // The default placeholder type Null = 1, // A NULL type having no physical storage Int = 2, // Signed or unsigned 8, 16, 32, or 64-bit little-endian integer @@ -85,7 +139,8 @@ export interface DataType { export abstract class DataType implements Partial { - public get [Symbol.toStringTag]() { return 'DataType'; } + // @ts-ignore + public [Symbol.toStringTag]: string; static isNull (x: DataType): x is Null { return x.TType === Type.Null; } static isInt (x: DataType): x is Int { return x.TType === Type.Int; } @@ -134,140 +189,203 @@ export abstract class DataType implements Partial { + ( proto).ArrayType = Array; + return proto[Symbol.toStringTag] = 'DataType'; + })(DataType.prototype); +} + +DataType['isNull'] = DataType.isNull; +DataType['isInt'] = DataType.isInt; +DataType['isFloat'] = DataType.isFloat; +DataType['isBinary'] = DataType.isBinary; +DataType['isUtf8'] = DataType.isUtf8; +DataType['isBool'] = DataType.isBool; +DataType['isDecimal'] = DataType.isDecimal; +DataType['isDate'] = DataType.isDate; +DataType['isTime'] = DataType.isTime; +DataType['isTimestamp'] = DataType.isTimestamp; +DataType['isInterval'] = DataType.isInterval; +DataType['isList'] = DataType.isList; +DataType['isStruct'] = DataType.isStruct; +DataType['isUnion'] = DataType.isUnion; +DataType['isDenseUnion'] = DataType.isDenseUnion; +DataType['isSparseUnion'] = DataType.isSparseUnion; +DataType['isFixedSizeBinary'] = DataType.isFixedSizeBinary; +DataType['isFixedSizeList'] = DataType.isFixedSizeList; +DataType['isMap'] = DataType.isMap; +DataType['isDictionary'] = DataType.isDictionary; export interface Null extends DataType { TArray: void; TValue: null; } export class Null extends DataType { constructor() { super(Type.Null); } - public get [Symbol.toStringTag]() { return 'Null'; } public toString() { return `Null`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitNull(this); } + protected static [Symbol.toStringTag] = ((proto: Null) => { + return proto[Symbol.toStringTag] = 'Null'; + })(Null.prototype); } export interface Int extends DataType { TArray: TArrayType; TValue: TValueType; } export class Int extends DataType { - // @ts-ignore - public readonly ArrayType: TypedArrayConstructor; constructor(public readonly isSigned: boolean, public readonly bitWidth: IntBitWidth) { super(Type.Int); } - public get [Symbol.toStringTag]() { return 'Int'; } - public toString() { return `${this.isSigned ? `` : `u`}int${this.bitWidth}`; } + // @ts-ignore + public readonly ArrayType: TypedArrayConstructor; + public toString() { return `${this.isSigned ? `I` : `Ui`}nt${this.bitWidth}`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitInt(this); } + protected static [Symbol.toStringTag] = ((proto: Int) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Int'; + })(Int.prototype); } export class Int8 extends Int { constructor() { super(true, 8); } - public get [Symbol.toStringTag]() { return 'Int8'; } - public get ArrayType() { return Int8Array; } + protected static [Symbol.toStringTag] = ((proto: Int8) => { + ( proto).ArrayType = Int8Array; + return proto[Symbol.toStringTag] = 'Int8'; + })(Int8.prototype); } export class Int16 extends Int { constructor() { super(true, 16); } - public get [Symbol.toStringTag]() { return 'Int16'; } - public get ArrayType() { return Int16Array; } + protected static [Symbol.toStringTag] = ((proto: Int16) => { + ( proto).ArrayType = Int16Array; + return proto[Symbol.toStringTag] = 'Int16'; + })(Int16.prototype); } export class Int32 extends Int { constructor() { super(true, 32); } - public get [Symbol.toStringTag]() { return 'Int32'; } - public get ArrayType() { return Int32Array; } + protected static [Symbol.toStringTag] = ((proto: Int32) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Int32'; + })(Int32.prototype); } export class Int64 extends Int { constructor() { super(true, 64); } - public get [Symbol.toStringTag]() { return 'Int64'; } - public get ArrayType() { return Int32Array; } + protected static [Symbol.toStringTag] = ((proto: Int64) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Int64'; + })(Int64.prototype); } export class Uint8 extends Int { constructor() { super(false, 8); } - public get [Symbol.toStringTag]() { return 'Uint8'; } - public get ArrayType() { return Uint8Array; } + protected static [Symbol.toStringTag] = ((proto: Uint8) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Uint8'; + })(Uint8.prototype); } export class Uint16 extends Int { constructor() { super(false, 16); } - public get [Symbol.toStringTag]() { return 'Uint16'; } - public get ArrayType() { return Uint16Array; } + protected static [Symbol.toStringTag] = ((proto: Uint16) => { + ( proto).ArrayType = Uint16Array; + return proto[Symbol.toStringTag] = 'Uint16'; + })(Uint16.prototype); } export class Uint32 extends Int { constructor() { super(false, 32); } - public get [Symbol.toStringTag]() { return 'Uint32'; } - public get ArrayType() { return Uint32Array; } + protected static [Symbol.toStringTag] = ((proto: Uint32) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Uint32'; + })(Uint32.prototype); } export class Uint64 extends Int { constructor() { super(false, 64); } - public get [Symbol.toStringTag]() { return 'Uint64'; } - public get ArrayType() { return Uint32Array; } + protected static [Symbol.toStringTag] = ((proto: Uint64) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Uint64'; + })(Uint64.prototype); } export interface Float extends DataType { TArray: TArrayType; TValue: number; } export class Float extends DataType { - // @ts-ignore - public readonly ArrayType: TypedArrayConstructor; - public get [Symbol.toStringTag]() { return 'Float'; } constructor(public readonly precision: Precision) { super(Type.Float); } - public toString() { return `Float precision[${this.precision}]`; } + // @ts-ignore + public readonly ArrayType: TypedArrayConstructor; + public toString() { return `Float${(this.precision << 5) || 16}`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFloat(this); } + protected static [Symbol.toStringTag] = ((proto: Float) => { + return proto[Symbol.toStringTag] = 'Float'; + })(Float.prototype); } +export interface Float16 extends Float {} export class Float16 extends Float { constructor() { super(Precision.HALF); } - public get [Symbol.toStringTag]() { return 'Float16'; } - public get ArrayType() { return Uint16Array; } + protected static [Symbol.toStringTag] = ((proto: Float16) => { + ( proto).ArrayType = Uint16Array; + return proto[Symbol.toStringTag] = 'Float16'; + })(Float16.prototype); } +export interface Float32 extends Float {} export class Float32 extends Float { constructor() { super(Precision.SINGLE); } - public get [Symbol.toStringTag]() { return 'Float32'; } - public get ArrayType() { return Float32Array; } + protected static [Symbol.toStringTag] = ((proto: Float32) => { + ( proto).ArrayType = Float32Array; + return proto[Symbol.toStringTag] = 'Float32'; + })(Float32.prototype); } +export interface Float64 extends Float {} export class Float64 extends Float { constructor() { super(Precision.DOUBLE); } - public get [Symbol.toStringTag]() { return 'Float64'; } - public get ArrayType() { return Float64Array; } + protected static [Symbol.toStringTag] = ((proto: Float64) => { + ( proto).ArrayType = Float64Array; + return proto[Symbol.toStringTag] = 'Float64'; + })(Float64.prototype); } export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } export class Binary extends DataType { constructor() { super(Type.Binary); } - public get [Symbol.toStringTag]() { return 'Binary'; } public toString() { return `Binary`; } - public get ArrayType() { return Uint8Array; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitBinary(this); } + protected static [Symbol.toStringTag] = ((proto: Binary) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Binary'; + })(Binary.prototype); } export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; } export class Utf8 extends DataType { constructor() { super(Type.Utf8); } - public get [Symbol.toStringTag]() { return 'Utf8'; } public toString() { return `Utf8`; } - public get ArrayType() { return Uint8Array; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitUtf8(this); } + protected static [Symbol.toStringTag] = ((proto: Utf8) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8'; + })(Utf8.prototype); } export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; } export class Bool extends DataType { constructor() { super(Type.Bool); } - public get [Symbol.toStringTag]() { return 'Bool'; } public toString() { return `Bool`; } - public get ArrayType() { return Uint8Array; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitBool(this); } + protected static [Symbol.toStringTag] = ((proto: Bool) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Bool'; + })(Bool.prototype); } export interface Decimal extends DataType { TArray: Uint32Array; TValue: Uint32Array; } @@ -276,22 +394,28 @@ export class Decimal extends DataType { public readonly precision: number) { super(Type.Decimal); } - public get [Symbol.toStringTag]() { return 'Decimal'; } - public get ArrayType() { return Uint32Array; } - public toString() { return `Decimal scale[${this.scale}], precision[${this.precision}]`; } + public toString() { return `Decimal[${this.precision}e${this.scale > 0 ? `+` : ``}${this.scale}]`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitDecimal(this); } + protected static [Symbol.toStringTag] = ((proto: Decimal) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Decimal'; + })(Decimal.prototype); } /* tslint:disable:class-name */ export interface Date_ extends DataType { TArray: Int32Array; TValue: Date; } export class Date_ extends DataType { constructor(public readonly unit: DateUnit) { super(Type.Date); } - public get [Symbol.toStringTag]() { return 'Date_'; } - public get ArrayType() { return Int32Array; } - public toString() { return `Date unit[${this.unit}]`; } - public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitDate(this); } + public toString() { return `Date${(this.unit + 1) * 32}<${DateUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDate(this); + } + protected static [Symbol.toStringTag] = ((proto: Date_) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Date'; + })(Date_.prototype); } export interface Time extends DataType { TArray: Uint32Array; TValue: number; } @@ -300,10 +424,14 @@ export class Time extends DataType { public readonly bitWidth: TimeBitWidth) { super(Type.Time); } - public get [Symbol.toStringTag]() { return 'Time'; } - public get ArrayType() { return Int32Array; } - public toString() { return `Time unit[${this.unit}], bitWidth[${this.bitWidth}]`; } - public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitTime(this); } + public toString() { return `Time${this.bitWidth}<${TimeUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitTime(this); + } + protected static [Symbol.toStringTag] = ((proto: Time) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Time'; + })(Time.prototype); } export interface Timestamp extends DataType { TArray: Int32Array; TValue: number; } @@ -311,12 +439,14 @@ export class Timestamp extends DataType { constructor(public unit: TimeUnit, public timezone?: string | null) { super(Type.Timestamp); } - public get [Symbol.toStringTag]() { return 'Timestamp'; } - public get ArrayType() { return Int32Array; } - public toString() { return `Timestamp unit[${this.unit}], timezone[${this.timezone}]`; } + public toString() { return `Timestamp<${TimeUnit[this.unit]}${this.timezone ? `, ${this.timezone}` : ``}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitTimestamp(this); } + protected static [Symbol.toStringTag] = ((proto: Timestamp) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Timestamp'; + })(Timestamp.prototype); } export interface Interval extends DataType { TArray: Int32Array; TValue: Int32Array; } @@ -324,12 +454,14 @@ export class Interval extends DataType { constructor(public unit: IntervalUnit) { super(Type.Interval); } - public get [Symbol.toStringTag]() { return 'Interval'; } - public get ArrayType() { return Int32Array; } - public toString() { return `Interval unit[${this.unit}]`; } + public toString() { return `Interval<${IntervalUnit[this.unit]}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitInterval(this); } + protected static [Symbol.toStringTag] = ((proto: Interval) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Interval'; + })(Interval.prototype); } export interface List extends DataType { TArray: any; TValue: Vector; } @@ -337,55 +469,63 @@ export class List extends DataType { constructor(public children: Field[]) { super(Type.List, children); } - public get [Symbol.toStringTag]() { return 'List'; } - public toString() { return `List`; } + public toString() { return `List<${this.valueType}>`; } + public get ArrayType() { return this.valueType.ArrayType; } public get valueType() { return this.children[0].type as T; } public get valueField() { return this.children[0] as Field; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitList(this); } + protected static [Symbol.toStringTag] = ((proto: List) => { + return proto[Symbol.toStringTag] = 'List'; + })(List.prototype); } -export interface Struct extends DataType { TArray: Uint8Array; TValue: View; } +export interface Struct extends DataType { TArray: any; TValue: View; } export class Struct extends DataType { constructor(public children: Field[]) { super(Type.Struct, children); } - public get [Symbol.toStringTag]() { return 'Struct'; } - public toString() { return `Struct`; } + public toString() { return `Struct<${this.children.map((f) => f.type).join(`, `)}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitStruct(this); } + protected static [Symbol.toStringTag] = ((proto: Struct) => { + return proto[Symbol.toStringTag] = 'Struct'; + })(Struct.prototype); } export interface Union extends DataType { TArray: Int8Array; TValue: any; } export class Union extends DataType { - constructor(TType: TType, - public readonly mode: UnionMode, + constructor(public readonly mode: UnionMode, public readonly typeIds: ArrowType[], public readonly children: Field[]) { - super(TType, children); + super( (mode === UnionMode.Sparse ? Type.SparseUnion : Type.DenseUnion), children); } - public get [Symbol.toStringTag]() { return 'Union'; } - public get ArrayType() { return Int8Array; } - public toString() { return `Union mode[${this.mode}] typeIds[${this.typeIds}]`; } + public toString() { return `${this[Symbol.toStringTag]}<${this.typeIds.map((x) => Type[x]).join(` | `)}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitUnion(this); } + protected static [Symbol.toStringTag] = ((proto: Union) => { + ( proto).ArrayType = Int8Array; + return proto[Symbol.toStringTag] = 'Union'; + })(Union.prototype); } export class DenseUnion extends Union { constructor(typeIds: ArrowType[], children: Field[]) { - super(Type.DenseUnion, UnionMode.Dense, typeIds, children); + super(UnionMode.Dense, typeIds, children); } - public get [Symbol.toStringTag]() { return 'DenseUnion'; } - public toString() { return `DenseUnion typeIds[${this.typeIds}]`; } + protected static [Symbol.toStringTag] = ((proto: DenseUnion) => { + return proto[Symbol.toStringTag] = 'DenseUnion'; + })(DenseUnion.prototype); } export class SparseUnion extends Union { constructor(typeIds: ArrowType[], children: Field[]) { - super(Type.SparseUnion, UnionMode.Sparse, typeIds, children); + super(UnionMode.Sparse, typeIds, children); } - public get [Symbol.toStringTag]() { return 'SparseUnion'; } - public toString() { return `SparseUnion typeIds[${this.typeIds}]`; } + protected static [Symbol.toStringTag] = ((proto: SparseUnion) => { + return proto[Symbol.toStringTag] = 'SparseUnion'; + })(SparseUnion.prototype); } export interface FixedSizeBinary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } @@ -393,10 +533,12 @@ export class FixedSizeBinary extends DataType { constructor(public readonly byteWidth: number) { super(Type.FixedSizeBinary); } - public get [Symbol.toStringTag]() { return 'FixedSizeBinary'; } - public get ArrayType() { return Uint8Array; } - public toString() { return `FixedSizeBinary byteWidth[${this.byteWidth}]`; } + public toString() { return `FixedSizeBinary[${this.byteWidth}]`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFixedSizeBinary(this); } + protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'FixedSizeBinary'; + })(FixedSizeBinary.prototype); } export interface FixedSizeList extends DataType { TArray: any; TValue: Vector; } @@ -405,11 +547,14 @@ export class FixedSizeList extends DataType; } - public toString() { return `FixedSizeList listSize[${this.listSize}]`; } + public toString() { return `FixedSizeList[${this.listSize}]<${this.valueType}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFixedSizeList(this); } + protected static [Symbol.toStringTag] = ((proto: FixedSizeList) => { + return proto[Symbol.toStringTag] = 'FixedSizeList'; + })(FixedSizeList.prototype); } /* tslint:disable:class-name */ @@ -419,30 +564,34 @@ export class Map_ extends DataType { public readonly children: Field[]) { super(Type.Map, children); } - public get [Symbol.toStringTag]() { return 'Map'; } - public toString() { return `Map keysSorted[${this.keysSorted}]`; } + public toString() { return `Map<${this.children.join(`, `)}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitMap(this); } + protected static [Symbol.toStringTag] = ((proto: Map_) => { + return proto[Symbol.toStringTag] = 'Map'; + })(Map_.prototype); } export interface Dictionary extends DataType { TArray: T['TArray']; TValue: T['TValue']; } export class Dictionary extends DataType { public readonly id: number; - public readonly indicies: Data; + public readonly dictionary: T; + public readonly indicies: Int; public readonly isOrdered: boolean; - public readonly dictionary: Data; - constructor(dictionary: Data, indicies: Data, id?: Long | number | null, isOrdered?: boolean | null) { + constructor(dictionary: T, indicies: Int, id?: Long | number | null, isOrdered?: boolean | null) { super(Type.Dictionary); - this.indicies = indicies; // a dictionary index defaults to signed 32 bit int if unspecified + this.indicies = indicies; this.dictionary = dictionary; this.isOrdered = isOrdered || false; - this.id = id == null ? - DictionaryBatch.atomicDictionaryId++ : - typeof id === 'number' ? id : id.low ; + this.id = id == null ? DictionaryBatch.getId() : typeof id === 'number' ? id : id.low; } - public get [Symbol.toStringTag]() { return 'Dictionary'; } + public get ArrayType() { return this.dictionary.ArrayType; } + public toString() { return `Dictionary<${this.dictionary}, ${this.indicies}>`; } public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitDictionary(this); } + protected static [Symbol.toStringTag] = ((proto: Dictionary) => { + return proto[Symbol.toStringTag] = 'Dictionary'; + })(Dictionary.prototype); } export interface IterableArrayLike extends ArrayLike, Iterable {} diff --git a/js/src/util/bit.ts b/js/src/util/bit.ts index e326f62cb67f6..2308bf6a2e03c 100644 --- a/js/src/util/bit.ts +++ b/js/src/util/bit.ts @@ -33,20 +33,40 @@ export function getBit(_data: any, _index: number, byte: number, bit: number): 0 return (byte & 1 << bit) >> bit as (0 | 1); } +export function setBool(bytes: Uint8Array, index: number, value: any) { + return value ? + !!(bytes[index >> 3] |= (1 << (index % 8))) || true : + !(bytes[index >> 3] &= ~(1 << (index % 8))) && false ; +} + +export function packBools(values: Iterable) { + let n = 0, i = 0; + let xs: number[] = []; + let bit = 0, byte = 0; + for (const value of values) { + value && (byte |= 1 << bit); + if (++bit === 8) { + xs[i++] = byte; + byte = bit = 0; + } + } + if (i === 0 || bit > 0) { xs[i++] = byte; } + if (i % 8 && (n = i + 8 - i % 8)) { + do { xs[i] = 0; } while (++i < n); + } + return new Uint8Array(xs); +} + export function* iterateBits(bytes: Uint8Array, begin: number, length: number, context: any, get: (context: any, index: number, byte: number, bit: number) => T) { let bit = begin % 8; - let bitLen = 8 - bit; let byteIndex = begin >> 3; - let byte = bytes[byteIndex]; - let remaining = length, index = begin; - while (remaining > 0) { - while (++bit < bitLen) { + let index = 0, remaining = length; + for (; remaining > 0; bit = 0) { + let byte = bytes[byteIndex++]; + do { yield get(context, index++, byte, bit); - } - bit = 0; - byte = bytes[++byteIndex]; - bitLen = Math.min(8, remaining -= 8); + } while (--remaining > 0 && ++bit < 8); } } diff --git a/js/src/util/compat.ts b/js/src/util/compat.ts new file mode 100644 index 0000000000000..7a4232ee8c32e --- /dev/null +++ b/js/src/util/compat.ts @@ -0,0 +1,49 @@ +export interface Subscription { + unsubscribe: () => void; +} + +export interface Observer { + closed?: boolean; + next: (value: T) => void; + error: (err: any) => void; + complete: () => void; +} + +export interface Observable { + subscribe: (observer: Observer) => Subscription; +} + +/** + * @ignore + */ +export function isPromise(x: any): x is PromiseLike { + return x != null && Object(x) === x && typeof x['then'] === 'function'; +} + +/** + * @ignore + */ +export function isObservable(x: any): x is Observable { + return x != null && Object(x) === x && typeof x['subscribe'] === 'function'; +} + +/** + * @ignore + */ +export function isArrayLike(x: any): x is ArrayLike { + return x != null && Object(x) === x && typeof x['length'] === 'number'; +} + +/** + * @ignore + */ +export function isIterable(x: any): x is Iterable { + return x != null && Object(x) === x && typeof x[Symbol.iterator] !== 'undefined'; +} + +/** + * @ignore + */ +export function isAsyncIterable(x: any): x is AsyncIterable { + return x != null && Object(x) === x && typeof x[Symbol.asyncIterator] !== 'undefined'; +} diff --git a/js/src/util/layout.ts b/js/src/util/layout.ts index 5f62f4fd9516f..29698fb3d2b93 100644 --- a/js/src/util/layout.ts +++ b/js/src/util/layout.ts @@ -15,17 +15,10 @@ // specific language governing permissions and limitations // under the License. +import { align } from './bit'; import { TextEncoder } from 'text-encoding-utf-8'; import { TypedArrayConstructor, TypedArray } from '../type'; -export function align(value: number, alignment: number) { - return value + padding(value, alignment); -} - -export function padding(value: number, alignment: number) { - return (value % alignment === 0 ? 0 : alignment - value % alignment); -} - export type NullableLayout = { nullCount: number, validity: Uint8Array }; export type BufferLayout> = { data: TArray }; export type DictionaryLayout> = { data: TArray, keys: number[] }; diff --git a/js/src/vector.ts b/js/src/vector.ts index 17f70717f39da..59ff4da5a786a 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -15,30 +15,39 @@ // specific language governing permissions and limitations // under the License. -import { Data } from './data'; +import { Data, ChunkedData } from './data'; import { VisitorNode, TypeVisitor, VectorVisitor } from './visitor'; -import { DataType, ListType, FlatType, NestedType } from './type'; +import { DataType, ListType, FlatType, NestedType, FlatListType } from './type'; import { IterableArrayLike, Precision, DateUnit, IntervalUnit, UnionMode } from './type'; export interface VectorLike { length: number; nullCount: number; } export interface View { + clone(data: Data): this; isValid(index: number): boolean; get(index: number): T['TValue'] | null; + set(index: number, value: T['TValue']): void; toArray(): IterableArrayLike; [Symbol.iterator](): IterableIterator; } -export class Vector implements VectorLike, View, Partial { +export class Vector implements VectorLike, View, VisitorNode { + public static create(data: Data): Vector { + return createVector(data); + } // @ts-ignore protected _data: Data; // @ts-ignore protected _view: View; constructor(data: Data, view: View) { - this._view = view; - const nullBitmap = (this._data = data).nullBitmap; - if (nullBitmap && nullBitmap.length > 0 && data.nullCount > 0) { - this._view = new ValidityView(data, this._view); + this._data = data; + let nulls: Uint8Array; + if (( data instanceof ChunkedData) && !(view instanceof ChunkedView)) { + this._view = new ChunkedView(data); + } else if (!(view instanceof ValidityView) && (nulls = data.nullBitmap!) && nulls.length > 0 && data.nullCount > 0) { + this._view = new ValidityView(data, view); + } else { + this._view = view; } } @@ -47,21 +56,52 @@ export class Vector implements VectorLike, View, Pa public get length() { return this._data.length; } public get nullCount() { return this._data.nullCount; } public get nullBitmap() { return this._data.nullBitmap; } - public get [Symbol.toStringTag]() { return `Vector<${this.type[Symbol.toStringTag]}>`; } - - public isValid(index: number): boolean { return this._view.isValid(index); } - public get(index: number): T['TValue'] | null { return this._view.get(index); } - public toArray(): IterableArrayLike { return this._view.toArray(); } - public [Symbol.iterator](): IterableIterator { return this._view[Symbol.iterator](); } + public get [Symbol.toStringTag]() { + return `Vector<${this.type[Symbol.toStringTag]}>`; + } + public toJSON() { return this.toArray(); } + public clone(data: Data): this { + return this._view.clone(this._data = data) && this || this; + } + public isValid(index: number): boolean { + return this._view.isValid(index); + } + public get(index: number): T['TValue'] | null { + return this._view.get(index); + } + public set(index: number, value: T['TValue']): void { + return this._view.set(index, value); + } + public toArray(): IterableArrayLike { + return this._view.toArray(); + } + public [Symbol.iterator](): IterableIterator { + return this._view[Symbol.iterator](); + } + public concat(...others: Vector[]): this { + if ((others = others.filter(Boolean)).length === 0) { + return this; + } + const { _view: view } = this; + const vecs = !(view instanceof ChunkedView) + ? [this, ...others] + : [...view.chunks, ...others]; + const offsets = ChunkedData.computeOffsets(vecs); + const chunksLength = offsets[offsets.length - 1]; + const chunkedData = new ChunkedData(this.type, chunksLength, vecs, 0, -1, offsets); + return new (this.constructor as any)(chunkedData, new ChunkedView(chunkedData)) as this; + } public slice(begin?: number, end?: number): this { - let total = this.length, from = begin || 0; - let to = typeof end === 'number' ? end : total; - if (to < 0) { to = total + to; } + let { length } = this; + let size = (this._view as any).size || 1; + let total = length, from = (begin || 0) * size; + let to = (typeof end === 'number' ? end : total) * size; + if (to < 0) { to = total - (to * -1) % total; } if (from < 0) { from = total - (from * -1) % total; } - if (to < from) { from = to; to = begin || 0; } + if (to < from) { [from, to] = [to, from]; } total = !isFinite(total = (to - from)) || total < 0 ? 0 : total; - const data = this._data.slice(from, Math.min(total, this.length)); - return new (this.constructor as any)(data, this._view) as this; + const newData = this._data.slice(from, Math.min(total, length)); + return new (this.constructor as any)(newData, this._view.clone(newData)) as this; } public acceptTypeVisitor(visitor: TypeVisitor): any { @@ -76,7 +116,7 @@ export abstract class FlatVector extends Vector { public get values() { return this._data.values; } } -export abstract class ListVectorBase extends Vector { +export abstract class ListVectorBase extends Vector { public get values() { return this._data.values; } public get valueOffsets() { return this._data.valueOffsets; } public getValueOffset(index: number) { @@ -102,156 +142,170 @@ import { List, Binary, Utf8, Bool, } from './type'; import { Null, Int, Float, Float16, Decimal, Date_, Time, Timestamp, Interval } from './type'; import { Struct, Union, SparseUnion, DenseUnion, FixedSizeBinary, FixedSizeList, Map_, Dictionary } from './type'; +import { ChunkedView } from './vector/chunked'; import { DictionaryView } from './vector/dictionary'; import { ListView, FixedSizeListView, BinaryView, Utf8View } from './vector/list'; import { UnionView, DenseUnionView, NestedView, StructView, MapView } from './vector/nested'; import { FlatView, NullView, BoolView, ValidityView, FixedSizeView, Float16View, DateDayView, DateMillisecondView, IntervalYearMonthView } from './vector/flat'; export class NullVector extends Vector { - constructor(data: Data) { - super(data, new NullView(data)); + constructor(data: Data, view: View = new NullView(data)) { + super(data, view); } } export class BoolVector extends Vector { - constructor(data: Data) { - super(data, new BoolView(data)); + constructor(data: Data, view: View = new BoolView(data)) { + super(data, view); } } -export class IntVector extends FlatVector> { - constructor(data: Data, view: View = IntVector.viewForBitWidth(data)) { - super(data, view); - } - static viewForBitWidth(data: Data) { +export class IntVector> extends FlatVector { + static defaultView(data: Data) { return data.type.bitWidth <= 32 ? new FlatView(data) : new FixedSizeView(data, (data.type.bitWidth / 32) | 0); } + constructor(data: Data, view: View = IntVector.defaultView(data)) { + super(data, view); + } } -export class FloatVector extends FlatVector> { - constructor(data: Data) { - super(data, data.type.precision !== Precision.HALF ? - new FlatView(data) : - new Float16View(data as Data)); +export class FloatVector> extends FlatVector { + static defaultView(data: Data): FlatView { + return data.type.precision !== Precision.HALF ? new FlatView(data) : new Float16View(data as Data); + } + constructor(data: Data, view: View = FloatVector.defaultView(data)) { + super(data, view); } } export class DateVector extends FlatVector { - constructor(data: Data) { - super(data, data.type.unit === DateUnit.DAY ? new DateDayView(data) : new DateMillisecondView(data, 2)); + static defaultView(data: Data) { + return data.type.unit === DateUnit.DAY ? new DateDayView(data) : new DateMillisecondView(data, 2); + } + constructor(data: Data, view: View = DateVector.defaultView(data)) { + super(data, view); } } export class DecimalVector extends FlatVector { - constructor(data: Data) { - super(data, new FixedSizeView(data, 4)); + constructor(data: Data, view: View = new FixedSizeView(data, 4)) { + super(data, view); } } export class TimeVector extends FlatVector