diff --git a/README.md b/README.md index 7a471dbc..cb20ec41 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,12 @@ Settings supported: is `false`. * `position` - Boolean. If `false`, then don't track line/col/position. Unset is - treated as `true`. Default is unset. + treated as `true`. Default is unset. Currently, setting this to `false` only + results in a cosmetic change: the errors reported do not contain position + information. sax-js would literally turn off the position-computing logic if + this flag was set to false. The notion was that it would optimize + execution. In saxes at least it turns out that continually testing this flag + causes a cost that offsets the benefits of turning off this logic. * `fileName` - String. Set a file name for error reporting. This is useful only when tracking positions. You may leave it unset, in which case the file name @@ -164,8 +169,10 @@ done processing the buffer, which is signaled by the `end` event. The parser has the following properties: -`line`, `column`, `position` - Indications of the position in the XML document -where the parser currently is looking. +`line`, `column`, `columnIndex`, `position` - Indications of the position in the +XML document where the parser currently is looking. The `columnIndex` property +counts columns as if indexing into a JavaScript string, whereas the `column` +property counts Unicode characters. `closed` - Boolean indicating whether or not the parser can be written to. If it's `true`, then wait for the `ready` event to write again. diff --git a/src/saxes.ts b/src/saxes.ts index c598662a..bb17ad3e 100644 --- a/src/saxes.ts +++ b/src/saxes.ts @@ -505,9 +505,24 @@ export class SaxesParser { */ xmlDecl!: XMLDecl; - /** The line number the parser is currently looking at. */ + /** + * The line number of the next character to be read by the parser. This field + * is one-based. (The first line is numbered 1.) + */ line!: number; + /** + * The column number of the next character to be read by the parser. * + * This field is zero-based. (The first column is 0.) + * + * This field counts columns by *Unicode character*. Note that this *can* + * be different from the index of the character in a JavaScript string due + * to how JavaScript handles astral plane characters. + * + * See [[columnIndex]] for a number that corresponds to the JavaScript index. + */ + column!: number; + /** * A map of entity name to expansion. */ @@ -664,18 +679,37 @@ export class SaxesParser { }; this.line = 1; + this.column = 0; this.ENTITIES = Object.create(XML_ENTITIES); this.onready(); } - /** The stream position the parser is currently looking at. */ + /** + * The stream position the parser is currently looking at. This field is + * zero-based. + * + * This field is not based on counting Unicode characters but is to be + * interpreted as a plain index into a JavaScript string. + */ get position(): number { return this.chunkPosition + this.i; } - get column(): number { + /** + * The column number of the next character to be read by the parser. * + * This field is zero-based. (The first column in a line is 0.) + * + * This field reports the index at which the next character would be in the + * line if the line were represented as a JavaScript string. Note that this + * *can* be different to a count based on the number of *Unicode characters* + * due to how JavaScript handles astral plane characters. + * + * See [[column]] for a number that corresponds to a count of Unicode + * characters. + */ + get columnIndex(): number { return this.position - this.positionAtNewLine; } @@ -898,6 +932,7 @@ export class SaxesParser { // than using codePointAt. const code = chunk.charCodeAt(i); + this.column++; if (code < 0xD800) { if (code >= SPACE || code === TAB) { return code; @@ -906,6 +941,7 @@ export class SaxesParser { switch (code) { case NL: this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL; case CR: @@ -921,6 +957,7 @@ export class SaxesParser { // In either case, \r becomes \n. this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL_LIKE; default: @@ -978,6 +1015,7 @@ export class SaxesParser { // than using codePointAt. const code = chunk.charCodeAt(i); + this.column++; if (code < 0xD800) { if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) || code === TAB) { @@ -987,6 +1025,7 @@ export class SaxesParser { switch (code) { case NL: // 0xA this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL; case CR: { // 0xD @@ -1004,6 +1043,7 @@ export class SaxesParser { case NEL: // 0x85 case LS: // Ox2028 this.line++; + this.column = 0; this.positionAtNewLine = this.position; return NL_LIKE; default: @@ -1045,6 +1085,11 @@ export class SaxesParser { return c === NL_LIKE ? NL : c; } + private unget(): void { + this.i = this.prevI; + this.column--; + } + /** * Capture characters into a buffer until encountering one of a set of * characters. @@ -1181,6 +1226,7 @@ export class SaxesParser { // If the initial character is 0xFEFF, ignore it. if (this.chunk.charCodeAt(0) === 0xFEFF) { this.i++; + this.column++; } // This initial loop is a specialized version of skipSpaces. We need to know @@ -1216,7 +1262,7 @@ export class SaxesParser { case EOC: break; default: - this.i = this.prevI; + this.unget(); this.state = S_TEXT; this.xmlDeclPossible = false; } @@ -1416,7 +1462,7 @@ export class SaxesParser { // either a /, ?, !, or text is coming next. if (isNameStartChar(c)) { this.state = S_OPEN_TAG; - this.i = this.prevI; + this.unget(); this.xmlDeclPossible = false; } else { @@ -1816,7 +1862,7 @@ export class SaxesParser { if (!isS(c)) { this.fail("whitespace required."); - this.i = this.prevI; + this.unget(); } this.state = S_XML_DECL_NAME_START; @@ -1899,7 +1945,7 @@ export class SaxesParser { return; } if (isNameStartChar(c)) { - this.i = this.prevI; + this.unget(); this.state = S_ATTRIB_NAME; } else if (c === GREATER) { @@ -1950,7 +1996,7 @@ export class SaxesParser { this.openTag(); } else if (isNameStartChar(c)) { - this.i = this.prevI; + this.unget(); this.state = S_ATTRIB_NAME; } else { @@ -1969,7 +2015,7 @@ export class SaxesParser { else if (!isS(c)) { this.fail("unquoted attribute value."); this.state = S_ATTRIB_VALUE_UNQUOTED; - this.i = this.prevI; + this.unget(); } } @@ -2025,7 +2071,7 @@ export class SaxesParser { } else if (isNameStartChar(c)) { this.fail("no whitespace between attributes."); - this.i = this.prevI; + this.unget(); this.state = S_ATTRIB_NAME; } else { diff --git a/test/parser-position.ts b/test/parser-position.ts index b18fa4d6..4066cd88 100644 --- a/test/parser-position.ts +++ b/test/parser-position.ts @@ -102,6 +102,13 @@ describe("parser position", () => { }); }); + testPosition("astral plane", ["💩"], [ + ["opentagstart", { position: 3, line: 1, column: 3 }], + ["opentag", { position: 3, line: 1, column: 3 }], + ["text", { position: 6, line: 1, column: 5 }], + ["closetag", { position: 9, line: 1, column: 8 }], + ]); + test({ name: "empty document", xml: "", diff --git a/test/trailing-non-whitespace.ts b/test/trailing-non-whitespace.ts index 7634700a..0d0ee60f 100644 --- a/test/trailing-non-whitespace.ts +++ b/test/trailing-non-whitespace.ts @@ -19,7 +19,7 @@ test({ attributes: {}, isSelfClosing: false, }], - ["error", "1:37: text data outside of root node."], + ["error", "1:36: text data outside of root node."], ["text", " to monkey land"], ["end", undefined], ["ready", undefined],