From cefc8f7e2a9a78f2abcae9385a3c9f8528cb4276 Mon Sep 17 00:00:00 2001 From: Louis-Dominique Dubeau Date: Thu, 9 Jan 2020 10:55:27 -0500 Subject: [PATCH] fix: handle column computation over characters in the astral plane The column numbers were reported as JavaScript string indexes. This is wrong if any astral plane character is present in the line. This commit fixes the issue. BREAKING CHANGE: The fix to column number reporting changes the meaning of the ``column`` field. If you need the old behavior of ``column`` you can use the new ``columnIndex`` field which behaves like the old ``column`` and may be useful in some contexts. Ultimately you should decide whether your application needs to know column numbers by Unicode character count or by JavaScript index. (And you need to know the difference between the two. You can see [this page](https://mathiasbynens.be/notes/javascript-unicode) for a detailed discussion of the Unicode problem in JavaScript. Note that the numbers put in the error messages that ``fail`` produce are still based on the ``column`` field and thus use the new meaning of ``column``. If you want error message that use ``columnIndex`` you may override the ``fail`` method. --- README.md | 13 +++++-- src/saxes.ts | 66 ++++++++++++++++++++++++++++----- test/parser-position.ts | 7 ++++ test/trailing-non-whitespace.ts | 2 +- 4 files changed, 74 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 7a471dbc..cb20ec41 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,12 @@ Settings supported: is `false`. * `position` - Boolean. If `false`, then don't track line/col/position. Unset is - treated as `true`. Default is unset. + treated as `true`. Default is unset. Currently, setting this to `false` only + results in a cosmetic change: the errors reported do not contain position + information. sax-js would literally turn off the position-computing logic if + this flag was set to false. The notion was that it would optimize + execution. In saxes at least it turns out that continually testing this flag + causes a cost that offsets the benefits of turning off this logic. * `fileName` - String. Set a file name for error reporting. This is useful only when tracking positions. You may leave it unset, in which case the file name @@ -164,8 +169,10 @@ done processing the buffer, which is signaled by the `end` event. The parser has the following properties: -`line`, `column`, `position` - Indications of the position in the XML document -where the parser currently is looking. +`line`, `column`, `columnIndex`, `position` - Indications of the position in the +XML document where the parser currently is looking. The `columnIndex` property +counts columns as if indexing into a JavaScript string, whereas the `column` +property counts Unicode characters. `closed` - Boolean indicating whether or not the parser can be written to. If it's `true`, then wait for the `ready` event to write again. diff --git a/src/saxes.ts b/src/saxes.ts index c598662a..bb17ad3e 100644 --- a/src/saxes.ts +++ b/src/saxes.ts @@ -505,9 +505,24 @@ export class SaxesParser { */ xmlDecl!: XMLDecl; - /** The line number the parser is currently looking at. */ + /** + * The line number of the next character to be read by the parser. This field + * is one-based. (The first line is numbered 1.) + */ line!: number; + /** + * The column number of the next character to be read by the parser. * + * This field is zero-based. (The first column is 0.) + * + * This field counts columns by *Unicode character*. Note that this *can* + * be different from the index of the character in a JavaScript string due + * to how JavaScript handles astral plane characters. + * + * See [[columnIndex]] for a number that corresponds to the JavaScript index. + */ + column!: number; + /** * A map of entity name to expansion. */ @@ -664,18 +679,37 @@ export class SaxesParser { }; this.line = 1; + this.column = 0; this.ENTITIES = Object.create(XML_ENTITIES); this.onready(); } - /** The stream position the parser is currently looking at. */ + /** + * The stream position the parser is currently looking at. This field is + * zero-based. + * + * This field is not based on counting Unicode characters but is to be + * interpreted as a plain index into a JavaScript string. + */ get position(): number { return this.chunkPosition + this.i; } - get column(): number { + /** + * The column number of the next character to be read by the parser. * + * This field is zero-based. (The first column in a line is 0.) + * + * This field reports the index at which the next character would be in the + * line if the line were represented as a JavaScript string. Note that this + * *can* be different to a count based on the number of *Unicode characters* + * due to how JavaScript handles astral plane characters. + * + * See [[column]] for a number that corresponds to a count of Unicode + * characters. + */ + get columnIndex(): number { return this.position - this.positionAtNewLine; } @@ -898,6 +932,7 @@ export class SaxesParser { // than using codePointAt. const code = chunk.charCodeAt(i); + this.column++; if (code < 0xD800) { if (code >= SPACE || code === TAB) { return code; @@ -906,6 +941,7 @@ export class SaxesParser { switch (code) { case NL: this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL; case CR: @@ -921,6 +957,7 @@ export class SaxesParser { // In either case, \r becomes \n. this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL_LIKE; default: @@ -978,6 +1015,7 @@ export class SaxesParser { // than using codePointAt. const code = chunk.charCodeAt(i); + this.column++; if (code < 0xD800) { if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) || code === TAB) { @@ -987,6 +1025,7 @@ export class SaxesParser { switch (code) { case NL: // 0xA this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL; case CR: { // 0xD @@ -1004,6 +1043,7 @@ export class SaxesParser { case NEL: // 0x85 case LS: // Ox2028 this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL_LIKE; default: @@ -1045,6 +1085,11 @@ export class SaxesParser { return c === NL_LIKE ? NL : c; } + private unget(): void { + this.i = this.prevI; + this.column--; + } + /** * Capture characters into a buffer until encountering one of a set of * characters. @@ -1181,6 +1226,7 @@ export class SaxesParser { // If the initial character is 0xFEFF, ignore it. if (this.chunk.charCodeAt(0) === 0xFEFF) { this.i++; + this.column++; } // This initial loop is a specialized version of skipSpaces. We need to know @@ -1216,7 +1262,7 @@ export class SaxesParser { case EOC: break; default: - this.i = this.prevI; + this.unget(); this.state = S_TEXT; this.xmlDeclPossible = false; } @@ -1416,7 +1462,7 @@ export class SaxesParser { // either a /, ?, !, or text is coming next. if (isNameStartChar(c)) { this.state = S_OPEN_TAG; - this.i = this.prevI; + this.unget(); this.xmlDeclPossible = false; } else { @@ -1816,7 +1862,7 @@ export class SaxesParser { if (!isS(c)) { this.fail("whitespace required."); - this.i = this.prevI; + this.unget(); } this.state = S_XML_DECL_NAME_START; @@ -1899,7 +1945,7 @@ export class SaxesParser { return; } if (isNameStartChar(c)) { - this.i = this.prevI; + this.unget(); this.state = S_ATTRIB_NAME; } else if (c === GREATER) { @@ -1950,7 +1996,7 @@ export class SaxesParser { this.openTag(); } else if (isNameStartChar(c)) { - this.i = this.prevI; + this.unget(); this.state = S_ATTRIB_NAME; } else { @@ -1969,7 +2015,7 @@ export class SaxesParser { else if (!isS(c)) { this.fail("unquoted attribute value."); this.state = S_ATTRIB_VALUE_UNQUOTED; - this.i = this.prevI; + this.unget(); } } @@ -2025,7 +2071,7 @@ export class SaxesParser { } else if (isNameStartChar(c)) { this.fail("no whitespace between attributes."); - this.i = this.prevI; + this.unget(); this.state = S_ATTRIB_NAME; } else { diff --git a/test/parser-position.ts b/test/parser-position.ts index b18fa4d6..4066cd88 100644 --- a/test/parser-position.ts +++ b/test/parser-position.ts @@ -102,6 +102,13 @@ describe("parser position", () => { }); }); + testPosition("astral plane", ["💩"], [ + ["opentagstart", { position: 3, line: 1, column: 3 }], + ["opentag", { position: 3, line: 1, column: 3 }], + ["text", { position: 6, line: 1, column: 5 }], + ["closetag", { position: 9, line: 1, column: 8 }], + ]); + test({ name: "empty document", xml: "", diff --git a/test/trailing-non-whitespace.ts b/test/trailing-non-whitespace.ts index 7634700a..0d0ee60f 100644 --- a/test/trailing-non-whitespace.ts +++ b/test/trailing-non-whitespace.ts @@ -19,7 +19,7 @@ test({ attributes: {}, isSelfClosing: false, }], - ["error", "1:37: text data outside of root node."], + ["error", "1:36: text data outside of root node."], ["text", " to monkey land"], ["end", undefined], ["ready", undefined],