diff --git a/README.md b/README.md index 845d04fc..98593b4f 100644 --- a/README.md +++ b/README.md @@ -247,6 +247,18 @@ want. `additionalNamespaces` applies before `resolvePrefix`. converted to ``\u000A`` prior to parsing. The optimal code path for saxes is a file in which all end of line characters are already ``\u000A``. +* Don't split Unicode strings you feed to saxes across surrogates. When you + naively split a string in JavaScript, you run the risk of splitting a Unicode + character into two surrogates. e.g. In the following example ``a`` and ``b`` + each contain half of a single Unicode character: ``const a = "\u{1F4A9}"[0]; + const b = "\u{1F4A9}"[1]``. If you feed such split surrogates to versions of + saxes prior to 4, you'd get errors. Saxes versions 4 and over are able to + detect when a chunk of data ends with a surrogate and carry over the surrogate + to the next chunk. However, this operation entails slicing and concatenating + strings. If you can feed your data in a way that does not split surrogates, + you should do it. (Obviously, feeding all the data at once with a single write + is fastest.) + ## FAQ Q. Why has saxes dropped support for limiting the size of data chunks passed to diff --git a/lib/saxes.js b/lib/saxes.js index 8907f1fb..6834994d 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -344,7 +344,7 @@ class SaxesParser { // effects. // this.prevI = 0; - this.trailingCR = false; + this.carriedFromPrevious = undefined; this.originalNL = true; this.forbiddenState = FORBIDDEN_START; /** @@ -573,20 +573,24 @@ class SaxesParser { // isn't. (There may be Node-specific code that would perform faster than // ``Array.from`` but don't want to be dependent on Node.) - if (this.trailingCR) { - // The previous chunk had a trailing cr. We need to handle it now. - chunk = `\r${chunk}`; - this.trailingCR = false; + if (this.carriedFromPrevious !== undefined) { + // The previous chunk had a char we must carry over. 
+ chunk = `${this.carriedFromPrevious}${chunk}`; + this.carriedFromPrevious = undefined; } let limit = chunk.length; - if (!end && chunk[limit - 1] === "\r") { - // The chunk ends with a trailing CR. We cannot know how to handle it - // until we get the next chunk or the end of the stream. So save it for - // later. + const lastCode = chunk.charCodeAt(limit - 1); + if (!end && + // A trailing CR or high surrogate must be carried over to the next + // chunk. + (lastCode === CR || (lastCode >= 0xD800 && lastCode <= 0xDBFF))) { + // The chunk ends with a character that must be carried over. We cannot + // know how to handle it until we get the next chunk or the end of the + // stream. So save it for later. + this.carriedFromPrevious = chunk[limit - 1]; limit--; chunk = chunk.slice(0, limit); - this.trailingCR = true; } this.chunk = chunk; diff --git a/test/unicode.js b/test/unicode.js new file mode 100644 index 00000000..01eb8c59 --- /dev/null +++ b/test/unicode.js @@ -0,0 +1,32 @@ +"use strict"; + +const { test } = require("."); + +describe("unicode test", () => { + describe("poop", () => { + const xml = "<a>💩</a>"; + const expect = [ + ["opentagstart", { name: "a", attributes: {} }], + ["opentag", { name: "a", attributes: {}, isSelfClosing: false }], + ["text", "💩"], + ["closetag", { name: "a", attributes: {}, isSelfClosing: false }], + ]; + + test({ + name: "intact", + xml, + expect, + }); + + test({ + name: "sliced", + fn(parser) { + // This test purposely slices the string into the poop character. + parser.write(xml.slice(0, 4)); + parser.write(xml.slice(4)); + parser.close(); + }, + expect, + }); + }); +});