From eb602e76466ef383d68aecd944af912ab11abfd4 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 29 Jul 2024 14:10:22 +1000 Subject: [PATCH] Emit onNodeInserted + onNodeClosed for the root document (#2182) Emit onNodeInserted + onNodeClosed for the root document This enables the source position tracking on the Document node (which was previously unset). Also enables the node traversor to see the outer Document node. --- CHANGES.md | 6 +++++- .../java/org/jsoup/parser/TreeBuilder.java | 9 ++++++++- .../java/org/jsoup/parser/PositionTest.java | 16 +++++++++++++++ .../org/jsoup/parser/StreamParserTest.java | 20 +++++++++---------- .../java/org/jsoup/safety/CleanerTest.java | 4 +++- 5 files changed, 42 insertions(+), 13 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 059c903ac4..9db0887ac1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,8 +3,12 @@ ## 1.18.2 (Pending) ### Improvements -* The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM, + +* The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM, subsequently to the original parse. [2140](https://github.com/jhy/jsoup/issues/2140) +* In the `TreeBuilder`, the `onNodeInserted()` and `onNodeClosed()` events are now also fired for the outermost / + root `Document` node. This enables source position tracking on the Document node (which was previously unset). And + it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182) ### Bug Fixes diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index fb5c0708fa..89ba82701b 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -58,6 +58,7 @@ void initialiseParse(Reader input, String baseUri, Parser parser) { start = new Token.StartTag(this); currentToken = start; // init current token to the virtual start token. this.baseUri = baseUri; + onNodeInserted(doc); } void completeParse() { @@ -108,7 +109,13 @@ void runParser() { boolean stepParser() { // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks: if (currentToken.type == Token.TokenType.EOF) { - if (stack == null || stack.isEmpty()) return false; // stack will be null if TB was closed, as in case of runParser() + completeFragment() + if (stack == null) { + return false; + } if (stack.isEmpty()) { + onNodeClosed(doc); // the root doc is not on the stack, so let this final step close it + stack = null; + return true; + } pop(); return true; } diff --git a/src/test/java/org/jsoup/parser/PositionTest.java b/src/test/java/org/jsoup/parser/PositionTest.java index e809809583..9dd42971a7 100644 --- a/src/test/java/org/jsoup/parser/PositionTest.java +++ b/src/test/java/org/jsoup/parser/PositionTest.java @@ -519,6 +519,22 @@ private void printRange(Node node) { assertEquals("class=\"On\"", attr.html()); } + @Test void tracksDocument() { + String html = "Foo

Bar."; + Document doc = Jsoup.parse(html, TrackingHtmlParser); + StringBuilder track = new StringBuilder(); + doc.forEachNode(node -> accumulatePositions(node, track)); + assertEquals("#document:0-0~40-40; #doctype:0-15; html:15-15~40-40; head:15-15~33-33; title:15-22~15-33; #text:22-25; body:33-33~40-40; p:33-36~40-40; #text:36-40; ", track.toString()); + } + + @Test void tracksDocumentXml() { + String html = "Foo

Bar."; + Document doc = Jsoup.parse(html, TrackingXmlParser); + StringBuilder track = new StringBuilder(); + doc.forEachNode(node -> accumulatePositions(node, track)); + assertEquals("#document:0-0~40-40; #doctype:0-15; title:15-22~25-33; #text:22-25; p:33-36~40-40; #text:36-40; ", track.toString()); + } + @Test void updateKeyMaintainsRangeUc() { String html = "

One

"; Document doc = Jsoup.parse(html, TrackingXmlParser); diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index bbbd620c59..cebab0c424 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -36,7 +36,7 @@ void canStream() { StringBuilder seen; seen = new StringBuilder(); parser.stream().forEachOrdered(el -> trackSeen(el, seen)); - assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;#root;", seen.toString()); // checks expected order, and the + indicates that element had a next sibling at time of emission } } @@ -48,7 +48,7 @@ void canStreamXml() { StringBuilder seen; seen = new StringBuilder(); parser.stream().forEachOrdered(el -> trackSeen(el, seen)); - assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;", seen.toString()); + assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;#root;", seen.toString()); // checks expected order, and the + indicates that element had a next sibling at time of emission } } @@ -64,7 +64,7 @@ void canStreamXml() { trackSeen(it.next(), seen); } - assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString()); + assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;#root;", seen.toString()); // checks expected order, and the + indicates that element had a next sibling at time of emission } @@ -75,13 +75,13 @@ void canStreamXml() { StringBuilder seen = new StringBuilder(); parser.stream().forEach(el -> trackSeen(el, seen)); - assertEquals("head+;p[One]+;p[Two];body;html;", seen.toString()); + assertEquals("head+;p[One]+;p[Two];body;html;#root;", seen.toString()); String html2 = "
Three
Four
"; StringBuilder seen2 = new StringBuilder(); parser.parse(html2, ""); parser.stream().forEach(el -> trackSeen(el, seen2)); - assertEquals("head+;div[Four];div[Three];body;html;", seen2.toString()); + assertEquals("head+;div[Four];div[Three];body;html;#root;", seen2.toString()); // re-run without a new parse should be empty StringBuilder seen3 = new StringBuilder(); @@ -247,7 +247,7 @@ static void trackSeen(Element el, StringBuilder actual) { StreamParser streamer = basic(); assertFalse(isClosed(streamer)); long count = streamer.stream().count(); - assertEquals(6, count); + assertEquals(7, count); assertTrue(isClosed(streamer)); } @@ -261,7 +261,7 @@ static void trackSeen(Element el, StringBuilder actual) { it.next(); count++; } - assertEquals(6, count); + assertEquals(7, count); assertTrue(isClosed(streamer)); } @@ -384,7 +384,7 @@ void canStreamFragment() { try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) { StringBuilder seen = new StringBuilder(); parser.stream().forEachOrdered(el -> trackSeen(el, seen)); - assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString()); + assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;#root;", seen.toString()); // checks expected order, and the + indicates that element had a next sibling at time of emission // note that we don't get a full doc, just the fragment (and the context at the end of the stack) @@ -405,7 +405,7 @@ void canStreamFragment() { trackSeen(it.next(), seen); } - assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString()); + assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;#root;", seen.toString()); // checks expected order, and the + indicates that element had a next sibling at time of emission // note that we don't get a full doc, just the fragment (and the context at the end of the stack) @@ -451,7 +451,7 @@ void canStreamFragmentXml() throws IOException { try (StreamParser parser = new StreamParser(Parser.xmlParser()).parseFragment(html, context, "")) { StringBuilder seen = new StringBuilder(); parser.stream().forEachOrdered(el -> trackSeen(el, seen)); - assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;", seen.toString()); + assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;#root;", seen.toString()); // checks expected order, and the + indicates that element had a next sibling at time of emission // note that we don't get a full doc, just the fragment diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index 961a7636c2..078d80fa73 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -395,6 +395,8 @@ public void bailsIfRemovingProtocolThatsNotSet() { Element p = orig.expectFirst("p"); Range origRange = p.sourceRange(); assertEquals("2,2:22-2,10:30", origRange.toString()); + assertEquals("1,1:0-1,1:0", orig.sourceRange().toString()); + assertEquals("2,19:39-2,19:39", orig.endSourceRange().toString()); Range.AttributeRange attributeRange = p.attributes().sourceRange("id"); assertEquals("2,5:25-2,7:27=2,8:28-2,9:29", attributeRange.toString()); @@ -404,7 +406,7 @@ public void bailsIfRemovingProtocolThatsNotSet() { assertEquals("1", cleanP.id()); Range cleanRange = cleanP.sourceRange(); assertEquals(origRange, cleanRange); - assertEquals(orig.endSourceRange(), clean.endSourceRange()); + assertEquals(p.endSourceRange(), cleanP.endSourceRange()); assertEquals(attributeRange, cleanP.attributes().sourceRange("id")); }