diff --git a/lib/saxes.js b/lib/saxes.js
index 011dead6..5d76168f 100644
--- a/lib/saxes.js
+++ b/lib/saxes.js
@@ -83,6 +83,7 @@ exports.EVENTS = [
];
const NL = 0xA;
+const CR = 0xD;
const SPACE = 0x20;
const BANG = 0x21;
const DQUOTE = 0x22;
@@ -105,7 +106,7 @@ function isQuote(c) {
}
const QUOTES = [DQUOTE, SQUOTE];
-const S = [SPACE, NL, 0xD, 9];
+const S = [SPACE, NL, CR, 9];
const TEXT_TERMINATOR = [LESS, AMP];
const DOCTYPE_TERMINATOR = [...QUOTES, OPEN_BRACKET, GREATER];
@@ -305,6 +306,7 @@ class SaxesParser {
this.chunk = "";
this.chunkPosition = 0;
this.i = 0;
+ this.trailingCR = false;
/**
* A map of entity name to expansion.
*
@@ -479,9 +481,13 @@ class SaxesParser {
if (this.closed) {
return this.fail("cannot write after close; assign an onready handler.");
}
+
+ let end = false;
if (chunk === null) {
- return this.end();
+ end = true;
+ chunk = "";
}
+
if (typeof chunk === "object") {
chunk = chunk.toString();
}
@@ -491,7 +497,23 @@ class SaxesParser {
// than the current repeated calls to ``codePointAt``. As of August 2018, it
// isn't. (There may be Node-specific code that would perform faster than
// ``Array.from`` but don't want to be dependent on Node.)
- const limit = this.limit = chunk.length;
+
+    if (this.trailingCR) {
+      // The previous chunk ended with a CR that was held back; replay it now.
+      chunk = `\r${chunk}`;
+      this.trailingCR = false;
+    }
+    // Measure after prepending so the carried-over CR is counted in limit.
+    let limit = chunk.length;
+    if (!end && chunk.charCodeAt(limit - 1) === CR) {
+      // The chunk ends with a trailing CR. We cannot know how to handle it
+      // until we get the next chunk or the end of the stream. So save it for
+      // later. (CR is a char code, hence charCodeAt rather than chunk[i].)
+      limit--;
+      this.trailingCR = true;
+    }
+    this.limit = limit;
+
this.chunk = chunk;
this.i = 0;
while (this.i < limit) {
@@ -499,7 +521,7 @@ class SaxesParser {
}
this.chunkPosition += limit;
- return this;
+ return end ? this.end() : this;
}
/**
@@ -521,12 +543,29 @@ class SaxesParser {
* @returns {number} The character read.
*/
getCode() {
- const { chunk, i } = this;
+ const { chunk } = this;
+ let { i } = this;
// Using charCodeAt and handling the surrogates ourselves is faster
// than using codePointAt.
let code = chunk.charCodeAt(i);
let skip = 1;
+ if (code === CR) {
+ // We may get undefined if we read past the end of the chunk, which is
+ // fine.
+ const next = chunk.charCodeAt(i + 1);
+ if (next === NL) {
+ // A \r\n sequence is converted to \n so we have to skip over the next
+ // character. We already know it has a size of 1 so ++ is fine here.
+ i++;
+ }
+ // Otherwise, a \r is just converted to \n, so we don't have to skip
+ // ahead.
+
+ // In either case, \r becomes \n.
+ code = NL;
+ }
+
if (code === NL) {
this.line++;
this.column = 0;
diff --git a/test/eol-handling.js b/test/eol-handling.js
new file mode 100644
index 00000000..8a07e9d3
--- /dev/null
+++ b/test/eol-handling.js
@@ -0,0 +1,58 @@
+"use strict";
+
+const { test } = require(".");
+
+/* eslint-disable linebreak-style */
+const xml = `\
+
+
+