From 9589054331fdf1a7d5228790d8ab594f4bfd6305 Mon Sep 17 00:00:00 2001 From: Chet Brittingham Date: Mon, 5 Aug 2019 14:39:56 -0500 Subject: [PATCH 1/2] Feature: add maxRows for #275 --- docs/parsing.md | 1 + src/parser/ParserOptions.ts | 9 +++++++++ src/parser/parser/Parser.ts | 5 ++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/parsing.md b/docs/parsing.md index 351753b3..d043459f 100644 --- a/docs/parsing.md +++ b/docs/parsing.md @@ -46,6 +46,7 @@ * `rtrim: {boolean} = false`: Set to `true` to right trim all fields. * `ltrim: {boolean} = false`: Set to `true` to left trim all fields. * `encoding: {string} = 'utf8'`: Passed to [StringDecoder](https://nodejs.org/api/string_decoder.html#string_decoder_new_stringdecoder_encoding) when decoding incoming buffers. Change if incoming content is not 'utf8' encoded. +- `maxRows: {number}`: Up to the given number of rows will be returned if set to a number greater than 0 (e.g., `100` would return the first 100 rows of data). ## Events diff --git a/src/parser/ParserOptions.ts b/src/parser/ParserOptions.ts index 350a24d8..8716bda3 100644 --- a/src/parser/ParserOptions.ts +++ b/src/parser/ParserOptions.ts @@ -15,6 +15,7 @@ export interface ParserOptionsArgs{ ltrim?: boolean; rtrim?: boolean; encoding?: string; + maxRows?: number; } export class ParserOptions { @@ -56,6 +57,10 @@ export class ParserOptions { public readonly encoding: string = 'utf8'; + public readonly limitRows: boolean = false; + + public readonly maxRows: number = -1; + public constructor(opts?: ParserOptionsArgs) { Object.assign(this, opts || {}); if (this.delimiter.length > 1) { @@ -65,5 +70,9 @@ export class ParserOptions { this.escapeChar = isString(this.escape) ? 
this.escape : this.quote; this.supportsComments = !isNil(this.comment); this.NEXT_TOKEN_REGEXP = new RegExp(`([^\\s]|\\r\\n|\\n|\\r|${this.escapedDelimiter})`); + + if (this.maxRows > 0) { + this.limitRows = true; + } } } diff --git a/src/parser/parser/Parser.ts b/src/parser/parser/Parser.ts index d00189b9..b73597ed 100644 --- a/src/parser/parser/Parser.ts +++ b/src/parser/parser/Parser.ts @@ -69,9 +69,11 @@ export default class Parser { return { line: scanner.line, rows }; } + private rowCount: number = 0; + private parseRow(scanner: Scanner, rows: RowArray[]): boolean { const nextToken = scanner.nextNonSpaceToken; - if (!nextToken) { + if (!nextToken || (this.parserOptions.limitRows && this.rowCount >= this.parserOptions.maxRows)) { return false; } const row = this.rowParser.parse(scanner); @@ -82,6 +84,7 @@ export default class Parser { return true; } rows.push(row); + this.rowCount += 1; return true; } } From 06eab1b182df170d27fe5bc1e10e6a2577b3c5b4 Mon Sep 17 00:00:00 2001 From: doug-martin Date: Sat, 14 Dec 2019 23:49:02 -0600 Subject: [PATCH 2/2] MaxRows option * [ADDED] `maxRows` option to limit the number of rows parsed. #275 #277 --- History.md | 4 +++ benchmark/.eslintrc.js | 4 ++- benchmark/index.js | 20 ++++++------- docs/parsing.md | 44 +++++++++++++++++++++++++++- examples/parsing/max_rows.example.js | 24 +++++++++++++++ src/parser/CsvParserStream.ts | 24 +++++++++++---- src/parser/ParserOptions.ts | 2 +- src/parser/parser/Parser.ts | 5 +--- test/parser/CsvParsingStream.test.ts | 18 ++++++++++++ test/parser/ParserOptions.test.ts | 20 +++++++++++++ 10 files changed, 143 insertions(+), 22 deletions(-) create mode 100644 examples/parsing/max_rows.example.js diff --git a/History.md b/History.md index 03f87eb5..251a00cc 100644 --- a/History.md +++ b/History.md @@ -1,3 +1,7 @@ +# v3.5.1 + +* [ADDED] `maxRows` option to limit the number of rows parsed. 
[#275](https://github.com/C2FO/fast-csv/issues/275) [#277](https://github.com/C2FO/fast-csv/pull/277) - [@cbrittingham](https://github.com/cbrittingham) + # v3.5.0 * Upgraded dependencies diff --git a/benchmark/.eslintrc.js b/benchmark/.eslintrc.js index 19068549..377d30db 100644 --- a/benchmark/.eslintrc.js +++ b/benchmark/.eslintrc.js @@ -1,6 +1,8 @@ module.exports = { + parserOptions: { + project: null, + }, rules: { "no-console": 0, - "@typescript-eslint/no-var-requires": 0 }, }; diff --git a/benchmark/index.js b/benchmark/index.js index b60a7ccf..f4a4ded5 100644 --- a/benchmark/index.js +++ b/benchmark/index.js @@ -2,7 +2,6 @@ const path = require('path'); const fs = require('fs'); const fastCsv = require('..'); - function camelize(str) { return str.replace(/_(.)/g, (a, b) => b.toUpperCase()); } @@ -11,7 +10,7 @@ const promisfyStream = (stream, expectedRows) => { let count = 0; return new Promise((res, rej) => { stream - .on('data', (row) => { + .on('data', row => { count += 1; }) .on('end', () => { @@ -25,13 +24,14 @@ const promisfyStream = (stream, expectedRows) => { }); }; -const benchmarkFastCsv = type => (num) => { +const benchmarkFastCsv = type => num => { const file = path.resolve(__dirname, `./assets/${num}.${type}.csv`); - const stream = fs.createReadStream(file) - .pipe(fastCsv.parse({ headers: true })) - .transform((data) => { + const stream = fs + .createReadStream(file) + .pipe(fastCsv.parse({ headers: true, maxRows: 10 })) + .transform(data => { const ret = {}; - [ 'first_name', 'last_name', 'email_address' ].forEach((prop) => { + ['first_name', 'last_name', 'email_address'].forEach(prop => { ret[camelize(prop)] = data[prop]; }); ret.address = data.address; @@ -47,7 +47,7 @@ async function benchmarkRun(title, num, m) { for (let i = 0; i < howMany; i += 1) { // eslint-disable-next-line no-await-in-loop await m(num); - console.log('%s: RUN(%d lines) 1 %dms', title, num, (new Date() - runStart)); + console.log('%s: RUN(%d lines) 1 %dms', title, num, 
new Date() - runStart); runStart = new Date(); } console.log('%s: 3xAVG for %d lines %dms', title, num, (new Date() - start) / howMany); @@ -55,7 +55,7 @@ async function benchmarkRun(title, num, m) { function runBenchmarks(num, type) { console.log(`\nRUNNING ${num}.${type}.csv benchmarks`, num); - return benchmarkRun('fast-csv', num, benchmarkFastCsv(type)) + return benchmarkRun('fast-csv', num, benchmarkFastCsv(type)); } function benchmarks(type) { @@ -67,7 +67,7 @@ function benchmarks(type) { benchmarks('nonquoted') .then(() => benchmarks('quoted')) .then(() => process.exit()) - .catch((e) => { + .catch(e => { console.error(e.stack); return process.exit(1); }); diff --git a/docs/parsing.md b/docs/parsing.md index 4aacd7c4..de6fd830 100644 --- a/docs/parsing.md +++ b/docs/parsing.md @@ -17,6 +17,7 @@ * [Ignoring Empty Rows](#csv-parse-ignoring-empty-rows) * [Transforming Rows](#csv-parse-transforming) * [Validating Rows](#csv-parse-validation) + * [Max Rows](#max-rows) ## Options @@ -45,7 +46,7 @@ * `rtrim: {boolean} = false`: Set to `true` to right trim all fields. * `ltrim: {boolean} = false`: Set to `true` to left trim all fields. * `encoding: {string} = 'utf8'`: Passed to [StringDecoder](https://nodejs.org/api/string_decoder.html#string_decoder_new_stringdecoder_encoding) when decoding incoming buffers. Change if incoming content is not 'utf8' encoded. -- `maxRows: {number}`: Up to the given number of rows will be returned if set to a number greater than 0 (e.g., `100` would return the first 100 rows of data). +* `maxRows: {number}`: If set to a number `> 0`, at most that many rows will be parsed (e.g. `100` would return the first 100 rows of data). ## Events @@ -586,3 +587,44 @@ Valid [row={"firstName":"timmy","lastName":"yukon"}] Parsed 2 rows ``` + +[`examples/parsing/max_rows.example.js`](../examples/parsing/max_rows.example.js) + +In the following example there are 10 rows, but only 5 will be parsed because of the `maxRows` option. 
+ +```javascript +const rows = [ + 'header1,header2\n', + 'col1,col1\n', + 'col2,col2\n', + 'col3,col3\n', + 'col4,col4\n', + 'col5,col5\n', + 'col6,col6\n', + 'col7,col7\n', + 'col8,col8\n', + 'col9,col9\n', + 'col10,col10', +]; + +const stream = csv + .parse({ headers: true, maxRows: 5 }) + .on('error', error => console.error(error)) + .on('data', row => console.log(row)) + .on('end', rowCount => console.log(`Parsed ${rowCount} rows`)); + +rows.forEach(row => stream.write(row)); +stream.end(); +``` + +Expected output + +``` +{ header1: 'col1', header2: 'col1' } +{ header1: 'col2', header2: 'col2' } +{ header1: 'col3', header2: 'col3' } +{ header1: 'col4', header2: 'col4' } +{ header1: 'col5', header2: 'col5' } +Parsed 5 rows +``` + diff --git a/examples/parsing/max_rows.example.js b/examples/parsing/max_rows.example.js new file mode 100644 index 00000000..261f92bf --- /dev/null +++ b/examples/parsing/max_rows.example.js @@ -0,0 +1,24 @@ +const csv = require('../../'); + +const rows = [ + 'header1,header2\n', + 'col1,col1\n', + 'col2,col2\n', + 'col3,col3\n', + 'col4,col4\n', + 'col5,col5\n', + 'col6,col6\n', + 'col7,col7\n', + 'col8,col8\n', + 'col9,col9\n', + 'col10,col10', +]; + +const stream = csv + .parse({ headers: true, maxRows: 5 }) + .on('error', error => console.error(error)) + .on('data', row => console.log(row)) + .on('end', rowCount => console.log(`Parsed ${rowCount} rows`)); + +rows.forEach(row => stream.write(row)); +stream.end(); diff --git a/src/parser/CsvParserStream.ts b/src/parser/CsvParserStream.ts index b740f317..c4460c1d 100644 --- a/src/parser/CsvParserStream.ts +++ b/src/parser/CsvParserStream.ts @@ -31,6 +31,10 @@ export default class CsvParserStream extends Transform { this.rowTransformerValidator = new RowTransformerValidator(); } + private get hasHitRowLimit(): boolean { + return this.parserOptions.limitRows && this.rowCount >= this.parserOptions.maxRows; + } + public transform(transformFunction: RowTransformFunction): CsvParserStream 
{ this.rowTransformerValidator.rowTransform = transformFunction; return this; @@ -54,23 +58,31 @@ export default class CsvParserStream extends Transform { } public _transform(data: Buffer, encoding: string, done: TransformCallback): void { + // if we have hit our maxRows parsing limit then skip parsing + if (this.hasHitRowLimit) { + return done(); + } try { const { lines } = this; const newLine = lines + this.decoder.write(data); const rows = this.parse(newLine, true); - this.processRows(rows, done); + return this.processRows(rows, done); } catch (e) { - done(e); + return done(e); } } public _flush(done: TransformCallback): void { + // if we have hit our maxRows parsing limit then skip parsing + if (this.hasHitRowLimit) { + return done(); + } try { const newLine = this.lines + this.decoder.end(); const rows = this.parse(newLine, false); - this.processRows(rows, done); + return this.processRows(rows, done); } catch (e) { - done(e); + return done(e); } } @@ -86,7 +98,9 @@ export default class CsvParserStream extends Transform { private processRows(rows: string[][], cb: TransformCallback): void { const rowsLength = rows.length; const iterate = (i: number): void => { - if (i >= rowsLength) { + // if we have emitted all rows or we have hit the maxRows limit option + // then end + if (i >= rowsLength || this.hasHitRowLimit) { return cb(); } const row = rows[i]; diff --git a/src/parser/ParserOptions.ts b/src/parser/ParserOptions.ts index 4d5329b1..6db33b29 100644 --- a/src/parser/ParserOptions.ts +++ b/src/parser/ParserOptions.ts @@ -60,7 +60,7 @@ export class ParserOptions { public readonly limitRows: boolean = false; - public readonly maxRows: number = -1; + public readonly maxRows: number = 0; public constructor(opts?: ParserOptionsArgs) { Object.assign(this, opts || {}); diff --git a/src/parser/parser/Parser.ts b/src/parser/parser/Parser.ts index 2a740393..bf3e1b62 100644 --- a/src/parser/parser/Parser.ts +++ b/src/parser/parser/Parser.ts @@ -70,11 +70,9 @@ export 
default class Parser { return { line: scanner.line, rows }; } - private rowCount: number = 0; - private parseRow(scanner: Scanner, rows: RowArray[]): boolean { const nextToken = scanner.nextNonSpaceToken; - if (!nextToken || (this.parserOptions.limitRows && this.rowCount >= this.parserOptions.maxRows)) { + if (!nextToken) { return false; } const row = this.rowParser.parse(scanner); @@ -85,7 +83,6 @@ export default class Parser { return true; } rows.push(row); - this.rowCount += 1; return true; } } diff --git a/test/parser/CsvParsingStream.test.ts b/test/parser/CsvParsingStream.test.ts index b4018d14..4a5ba5dc 100644 --- a/test/parser/CsvParsingStream.test.ts +++ b/test/parser/CsvParsingStream.test.ts @@ -267,6 +267,24 @@ describe('CsvParserStream', () => { }); }); + describe('maxRows', () => { + it('should parse up to the specified number of maxRows', () => { + const maxRows = 3; + parseContentAndCollect(assets.withHeaders, { headers: true, maxRows }).then(({ count, rows }) => { + assert.deepStrictEqual(rows, assets.withHeaders.parsed.slice(0, maxRows)); + assert.strictEqual(count, maxRows); + }); + }); + + it('should parse all rows if maxRows === 0', () => { + const maxRows = 0; + parseContentAndCollect(assets.withHeaders, { headers: true, maxRows }).then(({ count, rows }) => { + assert.deepStrictEqual(rows, assets.withHeaders.parsed); + assert.strictEqual(count, rows.length); + }); + }); + }); + it('should emit an error for malformed rows', next => { assets.write(assets.malformed); const stream = csv.parseFile(assets.malformed.path, { headers: true }); diff --git a/test/parser/ParserOptions.test.ts b/test/parser/ParserOptions.test.ts index c3bea9b5..98e828b0 100644 --- a/test/parser/ParserOptions.test.ts +++ b/test/parser/ParserOptions.test.ts @@ -163,4 +163,24 @@ describe('ParserOptions', () => { assert.strictEqual(createOptions({ renameHeaders: false }).renameHeaders, false); }); }); + + describe('#maxRows', () => { + it('should default maxRows 0 and limitRows 
to false', () => { + const opts = createOptions(); + assert.strictEqual(opts.maxRows, 0); + assert.strictEqual(opts.limitRows, false); + }); + + it('should set maxRows to the provided option and limitRows to true if maxRows > 0', () => { + const opts = createOptions({ maxRows: 1 }); + assert.strictEqual(opts.maxRows, 1); + assert.strictEqual(opts.limitRows, true); + }); + + it('should set maxRows to the provided option and limitRows to false if maxRows === 0', () => { + const opts = createOptions({ maxRows: 0 }); + assert.strictEqual(opts.maxRows, 0); + assert.strictEqual(opts.limitRows, false); + }); + }); });