diff --git a/lib/saxes.js b/lib/saxes.js index 011dead6..5d76168f 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -83,6 +83,7 @@ exports.EVENTS = [ ]; const NL = 0xA; +const CR = 0xD; const SPACE = 0x20; const BANG = 0x21; const DQUOTE = 0x22; @@ -105,7 +106,7 @@ function isQuote(c) { } const QUOTES = [DQUOTE, SQUOTE]; -const S = [SPACE, NL, 0xD, 9]; +const S = [SPACE, NL, CR, 9]; const TEXT_TERMINATOR = [LESS, AMP]; const DOCTYPE_TERMINATOR = [...QUOTES, OPEN_BRACKET, GREATER]; @@ -305,6 +306,7 @@ class SaxesParser { this.chunk = ""; this.chunkPosition = 0; this.i = 0; + this.trailingCR = false; /** * A map of entity name to expansion. * @@ -479,9 +481,13 @@ class SaxesParser { if (this.closed) { return this.fail("cannot write after close; assign an onready handler."); } + + let end = false; if (chunk === null) { - return this.end(); + end = true; + chunk = ""; } + if (typeof chunk === "object") { chunk = chunk.toString(); } @@ -491,7 +497,23 @@ class SaxesParser { // than the current repeated calls to ``codePointAt``. As of August 2018, it // isn't. (There may be Node-specific code that would perform faster than // ``Array.from`` but don't want to be dependent on Node.) - const limit = this.limit = chunk.length; + + let limit = chunk.length; + + if (this.trailingCR) { + // The previous chunk had a trailing cr. We need to handle it now. + chunk = `\r${chunk}`; + } + + if (!end && chunk[limit - 1] === CR) { + // The chunk ends with a trailing CR. We cannot know how to handle it + // until we get the next chunk or the end of the stream. So save it for + // later. + limit--; + this.trailingCR = true; + } + this.limit = limit; + this.chunk = chunk; this.i = 0; while (this.i < limit) { @@ -499,7 +521,7 @@ class SaxesParser { } this.chunkPosition += limit; - return this; + return end ? this.end() : this; } /** @@ -521,12 +543,29 @@ class SaxesParser { * @returns {number} The character read. */ getCode() { - const { chunk, i } = this; + const { chunk } = this; + let { i } = this; // Using charCodeAt and handling the surrogates ourselves is faster // than using codePointAt. let code = chunk.charCodeAt(i); let skip = 1; + if (code === CR) { + // We may get undefined if we read past the end of the chunk, which is + // fine. + const next = chunk.charCodeAt(i + 1); + if (next === NL) { + // A \r\n sequence is converted to \n so we have to skip over the next + // character. We already know it has a size of 1 so ++ is fine here. + i++; + } + // Otherwise, a \r is just converted to \n, so we don't have to skip + // ahead. + + // In either case, \r becomes \n. + code = NL; + } + if (code === NL) { this.line++; this.column = 0; diff --git a/test/eol-handling.js b/test/eol-handling.js new file mode 100644 index 00000000..8a07e9d3 --- /dev/null +++ b/test/eol-handling.js @@ -0,0 +1,58 @@ +"use strict"; + +const { test } = require("."); + +/* eslint-disable linebreak-style */ +const xml = `\ + + + + abc + def + ghi + xx xx + +`; +/* eslint-enable linebreak-style */ + +const expect = [ + ["text", "\n\n"], + ["opentagstart", { name: "moo", attributes: {} }], + ["opentag", { + name: "moo", + attributes: { + a: "12\n 3", + }, + isSelfClosing: false, + }], + ["text", "\n abc\n def\r\n ghi\n\n xx\nxx\n"], + ["closetag", { + name: "moo", + attributes: { + a: "12\n 3", + }, + isSelfClosing: false, + }], + ["text", "\n"], +]; + +describe("eol handling", () => { + test({ + name: "one chunk", + xml, + expect, + }); + + test({ + name: "char-by-char", + expect, + fn(parser) { + for (const x of xml) { + parser.write(x); + } + parser.close(); + }, + }); +}); diff --git a/test/parser-position.js b/test/parser-position.js index bf826941..60995ff6 100644 --- a/test/parser-position.js +++ b/test/parser-position.js @@ -7,14 +7,19 @@ const { test } = require("."); function testPosition(name, chunks, expectedEvents) { it(name, () => { const parser = new saxes.SaxesParser(); - for (const expectation of expectedEvents) { - parser[`on${expectation[0]}`] = function handler() { + let expectedIx = 0; + for (const ev of saxes.EVENTS) { + // eslint-disable-next-line no-loop-func + parser[`on${ev}`] = () => { + const expectation = expectedEvents[expectedIx++]; + expect(expectation[0]).to.equal(ev); // eslint-disable-next-line guard-for-in for (const prop in expectation[1]) { expect(parser[prop]).to.deep.equal(expectation[1][prop]); } }; } + for (const chunk of chunks) { parser.write(chunk); } @@ -40,6 +45,19 @@ describe("parser position", () => { ["closetag", { position: 19 }], ]); + testPosition( + "with various newlines", + ["
abcde\r\nf\rgh
"], [ + ["opentagstart", { position: 5, line: 1, column: 5 }], + ["opentag", { position: 5, line: 1, column: 5 }], + ["text", { position: 17, line: 2, column: 5 }], + ["opentagstart", { position: 17, line: 2, column: 5 }], + ["opentag", { position: 18, line: 2, column: 6 }], + ["closetag", { position: 18, line: 2, column: 6 }], + ["text", { position: 28, line: 3, column: 8 }], + ["closetag", { position: 28, line: 3, column: 8 }], + ]); + test({ name: "pi before root", xml: "",