Skip to content

Commit

Permalink
fix: normalize \r\n and \r followed by something else to \n
Browse files Browse the repository at this point in the history
Closes #2
  • Loading branch information
lddubeau committed Aug 31, 2018
1 parent dcf84d0 commit d7b1abe
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 7 deletions.
49 changes: 44 additions & 5 deletions lib/saxes.js
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ exports.EVENTS = [
];

const NL = 0xA;
const CR = 0xD;
const SPACE = 0x20;
const BANG = 0x21;
const DQUOTE = 0x22;
Expand All @@ -105,7 +106,7 @@ function isQuote(c) {
}

const QUOTES = [DQUOTE, SQUOTE];
const S = [SPACE, NL, 0xD, 9];
const S = [SPACE, NL, CR, 9];

const TEXT_TERMINATOR = [LESS, AMP];
const DOCTYPE_TERMINATOR = [...QUOTES, OPEN_BRACKET, GREATER];
Expand Down Expand Up @@ -305,6 +306,7 @@ class SaxesParser {
this.chunk = "";
this.chunkPosition = 0;
this.i = 0;
this.trailingCR = false;
/**
* A map of entity name to expansion.
*
Expand Down Expand Up @@ -479,9 +481,13 @@ class SaxesParser {
if (this.closed) {
return this.fail("cannot write after close; assign an onready handler.");
}

let end = false;
if (chunk === null) {
return this.end();
end = true;
chunk = "";
}

if (typeof chunk === "object") {
chunk = chunk.toString();
}
Expand All @@ -491,15 +497,31 @@ class SaxesParser {
// than the current repeated calls to ``codePointAt``. As of August 2018, it
// isn't. (There may be Node-specific code that would perform faster than
// ``Array.from`` but don't want to be dependent on Node.)
const limit = this.limit = chunk.length;

let limit = chunk.length;

if (this.trailingCR) {
// The previous chunk had a trailing CR. We need to handle it now.
chunk = `\r${chunk}`;
}

if (!end && chunk[limit - 1] === CR) {
// The chunk ends with a trailing CR. We cannot know how to handle it
// until we get the next chunk or the end of the stream. So save it for
// later.
limit--;
this.trailingCR = true;
}
this.limit = limit;

this.chunk = chunk;
this.i = 0;
while (this.i < limit) {
this[this.state]();
}
this.chunkPosition += limit;

return this;
return end ? this.end() : this;
}

/**
Expand All @@ -521,12 +543,29 @@ class SaxesParser {
* @returns {number} The character read.
*/
getCode() {
const { chunk, i } = this;
const { chunk } = this;
let { i } = this;
// Using charCodeAt and handling the surrogates ourselves is faster
// than using codePointAt.
let code = chunk.charCodeAt(i);
let skip = 1;

if (code === CR) {
// We may get NaN if we read past the end of the chunk, which is
// fine.
const next = chunk.charCodeAt(i + 1);
if (next === NL) {
// A \r\n sequence is converted to \n so we have to skip over the next
// character. We already know it has a size of 1 so ++ is fine here.
i++;
}
// Otherwise, a \r is just converted to \n, so we don't have to skip
// ahead.

// In either case, \r becomes \n.
code = NL;
}

if (code === NL) {
this.line++;
this.column = 0;
Expand Down
58 changes: 58 additions & 0 deletions test/eol-handling.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"use strict";

/**
 * Tests for end-of-line handling.
 *
 * The same document and the same list of expected events are handed to the
 * ``test`` helper twice: once passing the whole document as ``xml``, and once
 * with a custom ``fn`` that writes the document to the parser one character
 * at a time and then closes it.
 */

const { test } = require(".");

// The document under test. The backslash right after the opening backtick is
// a line continuation, so the string starts at the XML declaration.
//
// NOTE(review): the linebreak-style lint rule is switched off around this
// literal, presumably because the checked-in file carries mixed raw line
// endings (\r\n and bare \r) inside it -- confirm, and make sure editor or
// git settings do not normalize them.
/* eslint-disable linebreak-style */
const xml = `\
<?xml version="1.0" encoding="utf-8"?>
<moo a="12
3"
>
abc
def&#xD;
ghi
xxxx
</moo>
`;
/* eslint-enable linebreak-style */

// Expected events as [event name, payload] pairs (presumably compared in
// order by the ``test`` helper -- verify against the helper). Line breaks in
// the expected text and attribute values are all "\n"; the single "\r" (after
// "def") corresponds to the &#xD; character reference in the document.
const expect = [
["text", "\n\n"],
["opentagstart", { name: "moo", attributes: {} }],
["opentag", {
name: "moo",
attributes: {
a: "12\n 3",
},
isSelfClosing: false,
}],
["text", "\n abc\n def\r\n ghi\n\n xx\nxx\n"],
["closetag", {
name: "moo",
attributes: {
a: "12\n 3",
},
isSelfClosing: false,
}],
["text", "\n"],
];

describe("eol handling", () => {
// The helper receives the whole document as ``xml``.
test({
name: "one chunk",
xml,
expect,
});

// No ``xml`` is given here; ``fn`` drives the parser itself, issuing one
// ``write`` call per character of the document and then closing the parser.
test({
name: "char-by-char",
expect,
fn(parser) {
for (const x of xml) {
parser.write(x);
}
parser.close();
},
});
});
Expand Down
22 changes: 20 additions & 2 deletions test/parser-position.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,19 @@ const { test } = require(".");
function testPosition(name, chunks, expectedEvents) {
it(name, () => {
const parser = new saxes.SaxesParser();
for (const expectation of expectedEvents) {
parser[`on${expectation[0]}`] = function handler() {
let expectedIx = 0;
for (const ev of saxes.EVENTS) {
// eslint-disable-next-line no-loop-func
parser[`on${ev}`] = () => {
const expectation = expectedEvents[expectedIx++];
expect(expectation[0]).to.equal(ev);
// eslint-disable-next-line guard-for-in
for (const prop in expectation[1]) {
expect(parser[prop]).to.deep.equal(expectation[1][prop]);
}
};
}

for (const chunk of chunks) {
parser.write(chunk);
}
Expand All @@ -40,6 +45,19 @@ describe("parser position", () => {
["closetag", { position: 19 }],
]);

testPosition(
"with various newlines",
["<div>abcde\r\n<foo/>f\rgh</div>"], [
["opentagstart", { position: 5, line: 1, column: 5 }],
["opentag", { position: 5, line: 1, column: 5 }],
["text", { position: 17, line: 2, column: 5 }],
["opentagstart", { position: 17, line: 2, column: 5 }],
["opentag", { position: 18, line: 2, column: 6 }],
["closetag", { position: 18, line: 2, column: 6 }],
["text", { position: 28, line: 3, column: 8 }],
["closetag", { position: 28, line: 3, column: 8 }],
]);

test({
name: "pi before root",
xml: "",
Expand Down

0 comments on commit d7b1abe

Please sign in to comment.