From eb602e76466ef383d68aecd944af912ab11abfd4 Mon Sep 17 00:00:00 2001
From: Jonathan Hedley <jonathan@hedley.net>
Date: Mon, 29 Jul 2024 14:10:22 +1000
Subject: [PATCH] Emit onNodeInserted + onNodeClosed for the root document
 (#2182)

Emit onNodeInserted + onNodeClosed for the root document

This enables source position tracking on the Document node (which was previously unset). It also enables the node traversor to see the outer Document node.
---
 CHANGES.md                                    |  6 +++++-
 .../java/org/jsoup/parser/TreeBuilder.java    |  9 ++++++++-
 .../java/org/jsoup/parser/PositionTest.java   | 16 +++++++++++++++
 .../org/jsoup/parser/StreamParserTest.java    | 20 +++++++++----------
 .../java/org/jsoup/safety/CleanerTest.java    |  4 +++-
 5 files changed, 42 insertions(+), 13 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 059c903ac4..9db0887ac1 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,8 +3,12 @@
 ## 1.18.2 (Pending)
 
 ### Improvements
-* The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM, 
+
+* The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM,
   subsequently to the original parse. [2140](https://github.com/jhy/jsoup/issues/2140)
+* In the `TreeBuilder`, the `onNodeInserted()` and `onNodeClosed()` events are now also fired for the outermost /
+  root `Document` node. This enables source position tracking on the Document node (which was previously unset), and
+  also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182)
 
 ### Bug Fixes
 
diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java
index fb5c0708fa..89ba82701b 100644
--- a/src/main/java/org/jsoup/parser/TreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/TreeBuilder.java
@@ -58,6 +58,7 @@ void initialiseParse(Reader input, String baseUri, Parser parser) {
         start = new Token.StartTag(this);
         currentToken = start; // init current token to the virtual start token.
         this.baseUri = baseUri;
+        onNodeInserted(doc);
     }
 
     void completeParse() {
@@ -108,7 +109,13 @@ void runParser() {
     boolean stepParser() {
         // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks:
         if (currentToken.type == Token.TokenType.EOF) {
-            if (stack == null || stack.isEmpty()) return false; // stack will be null if TB was closed, as in case of runParser() + completeFragment()
+            if (stack == null) { // null if the TB was closed (as with runParser() + completeFragment()), or after the root close below
+                return false;
+            } else if (stack.isEmpty()) {
+                onNodeClosed(doc); // the root doc is not on the stack, so let this final step close it
+                stack = null; // so that a subsequent step hits the null check above and reports no more work
+                return true;
+            }
             pop();
             return true;
         }
diff --git a/src/test/java/org/jsoup/parser/PositionTest.java b/src/test/java/org/jsoup/parser/PositionTest.java
index e809809583..9dd42971a7 100644
--- a/src/test/java/org/jsoup/parser/PositionTest.java
+++ b/src/test/java/org/jsoup/parser/PositionTest.java
@@ -519,6 +519,22 @@ private void printRange(Node node) {
         assertEquals("class=\"On\"", attr.html());
     }
 
+    @Test void tracksDocument() { // the root #document node must be position tracked and visited by forEachNode
+        String html = "<!doctype html><title>Foo</title><p>Bar."; // 40 chars long; the </title> end tag spans 25-33
+        Document doc = Jsoup.parse(html, TrackingHtmlParser);
+        StringBuilder track = new StringBuilder();
+        doc.forEachNode(node -> accumulatePositions(node, track));
+        assertEquals("#document:0-0~40-40; #doctype:0-15; html:15-15~40-40; head:15-15~33-33; title:15-22~25-33; #text:22-25; body:33-33~40-40; p:33-36~40-40; #text:36-40; ", track.toString());
+    }
+
+    @Test void tracksDocumentXml() {
+        // XML parse of the 40 char input: the expected walk starts at #document and has no html / head / body elements
+        StringBuilder positions = new StringBuilder();
+        Document parsed = Jsoup.parse("<!doctype html><title>Foo</title><p>Bar.", TrackingXmlParser);
+        parsed.forEachNode(n -> accumulatePositions(n, positions));
+        assertEquals("#document:0-0~40-40; #doctype:0-15; title:15-22~25-33; #text:22-25; p:33-36~40-40; #text:36-40; ", positions.toString());
+    }
+
     @Test void updateKeyMaintainsRangeUc() {
         String html = "<p xsi:CLASS=On>One</p>";
         Document doc = Jsoup.parse(html, TrackingXmlParser);
diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java
index bbbd620c59..cebab0c424 100644
--- a/src/test/java/org/jsoup/parser/StreamParserTest.java
+++ b/src/test/java/org/jsoup/parser/StreamParserTest.java
@@ -36,7 +36,7 @@ void canStream() {
             StringBuilder seen;
             seen = new StringBuilder();
             parser.stream().forEachOrdered(el -> trackSeen(el, seen));
-            assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString());
+            assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;#root;", seen.toString());
             // checks expected order, and the + indicates that element had a next sibling at time of emission
         }
     }
@@ -48,7 +48,7 @@ void canStreamXml() {
             StringBuilder seen;
             seen = new StringBuilder();
             parser.stream().forEachOrdered(el -> trackSeen(el, seen));
-            assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;", seen.toString());
+            assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;#root;", seen.toString());
             // checks expected order, and the + indicates that element had a next sibling at time of emission
         }
     }
@@ -64,7 +64,7 @@ void canStreamXml() {
             trackSeen(it.next(), seen);
         }
 
-        assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString());
+        assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;#root;", seen.toString());
         // checks expected order, and the + indicates that element had a next sibling at time of emission
     }
 
@@ -75,13 +75,13 @@ void canStreamXml() {
 
         StringBuilder seen = new StringBuilder();
         parser.stream().forEach(el -> trackSeen(el, seen));
-        assertEquals("head+;p[One]+;p[Two];body;html;", seen.toString());
+        assertEquals("head+;p[One]+;p[Two];body;html;#root;", seen.toString());
 
         String html2 = "<div>Three<div>Four</div></div>";
         StringBuilder seen2 = new StringBuilder();
         parser.parse(html2, "");
         parser.stream().forEach(el -> trackSeen(el, seen2));
-        assertEquals("head+;div[Four];div[Three];body;html;", seen2.toString());
+        assertEquals("head+;div[Four];div[Three];body;html;#root;", seen2.toString());
 
         // re-run without a new parse should be empty
         StringBuilder seen3 = new StringBuilder();
@@ -247,7 +247,7 @@ static void trackSeen(Element el, StringBuilder actual) {
         StreamParser streamer = basic();
         assertFalse(isClosed(streamer));
         long count = streamer.stream().count();
-        assertEquals(6, count);
+        assertEquals(7, count);
 
         assertTrue(isClosed(streamer));
     }
@@ -261,7 +261,7 @@ static void trackSeen(Element el, StringBuilder actual) {
             it.next();
             count++;
         }
-        assertEquals(6, count);
+        assertEquals(7, count);
         assertTrue(isClosed(streamer));
     }
 
@@ -384,7 +384,7 @@ void canStreamFragment() {
         try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) {
             StringBuilder seen = new StringBuilder();
             parser.stream().forEachOrdered(el -> trackSeen(el, seen));
-            assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString());
+            assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;#root;", seen.toString());
             // checks expected order, and the + indicates that element had a next sibling at time of emission
             // note that we don't get a full doc, just the fragment (and the context at the end of the stack)
 
@@ -405,7 +405,7 @@ void canStreamFragment() {
                 trackSeen(it.next(), seen);
             }
 
-            assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString());
+            assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;#root;", seen.toString());
             // checks expected order, and the + indicates that element had a next sibling at time of emission
             // note that we don't get a full doc, just the fragment (and the context at the end of the stack)
 
@@ -451,7 +451,7 @@ void canStreamFragmentXml() throws IOException {
         try (StreamParser parser = new StreamParser(Parser.xmlParser()).parseFragment(html, context, "")) {
             StringBuilder seen = new StringBuilder();
             parser.stream().forEachOrdered(el -> trackSeen(el, seen));
-            assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;", seen.toString());
+            assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;#root;", seen.toString());
             // checks expected order, and the + indicates that element had a next sibling at time of emission
             // note that we don't get a full doc, just the fragment
 
diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java
index 961a7636c2..078d80fa73 100644
--- a/src/test/java/org/jsoup/safety/CleanerTest.java
+++ b/src/test/java/org/jsoup/safety/CleanerTest.java
@@ -395,6 +395,8 @@ public void bailsIfRemovingProtocolThatsNotSet() {
         Element p = orig.expectFirst("p");
         Range origRange = p.sourceRange();
         assertEquals("2,2:22-2,10:30", origRange.toString());
+        assertEquals("1,1:0-1,1:0", orig.sourceRange().toString());
+        assertEquals("2,19:39-2,19:39", orig.endSourceRange().toString());
 
         Range.AttributeRange attributeRange = p.attributes().sourceRange("id");
         assertEquals("2,5:25-2,7:27=2,8:28-2,9:29", attributeRange.toString());
@@ -404,7 +406,7 @@ public void bailsIfRemovingProtocolThatsNotSet() {
         assertEquals("1", cleanP.id());
         Range cleanRange = cleanP.sourceRange();
         assertEquals(origRange, cleanRange);
-        assertEquals(orig.endSourceRange(), clean.endSourceRange());
+        assertEquals(p.endSourceRange(), cleanP.endSourceRange());
         assertEquals(attributeRange, cleanP.attributes().sourceRange("id"));
     }