Skip to content

Commit

Permalink
Emit onNodeInserted + onNodeClosed for the root document (#2182)
Browse files Browse the repository at this point in the history
Emit onNodeInserted + onNodeClosed for the root document

This enables the source position tracking on the Document node (which was previously unset). Also enables the node traversor to see the outer Document node.
  • Loading branch information
jhy authored Jul 29, 2024
1 parent f5244dc commit eb602e7
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 13 deletions.
6 changes: 5 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@
## 1.18.2 (Pending)

### Improvements
* The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM,

* The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM,
subsequently to the original parse. [2140](https://github.com/jhy/jsoup/issues/2140)
* In the `TreeBuilder`, the `onNodeInserted()` and `onNodeClosed()` events are now also fired for the outermost /
root `Document` node. This enables source position tracking on the Document node (which was previously unset). And
it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182)

### Bug Fixes

Expand Down
9 changes: 8 additions & 1 deletion src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ void initialiseParse(Reader input, String baseUri, Parser parser) {
start = new Token.StartTag(this);
currentToken = start; // init current token to the virtual start token.
this.baseUri = baseUri;
onNodeInserted(doc);
}

void completeParse() {
Expand Down Expand Up @@ -108,7 +109,13 @@ void runParser() {
boolean stepParser() {
// if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks:
if (currentToken.type == Token.TokenType.EOF) {
if (stack == null || stack.isEmpty()) return false; // stack will be null if TB was closed, as in case of runParser() + completeFragment()
if (stack == null) {
return false;
} if (stack.isEmpty()) {
onNodeClosed(doc); // the root doc is not on the stack, so let this final step close it
stack = null;
return true;
}
pop();
return true;
}
Expand Down
16 changes: 16 additions & 0 deletions src/test/java/org/jsoup/parser/PositionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,22 @@ private void printRange(Node node) {
assertEquals("class=\"On\"", attr.html());
}

@Test void tracksDocument() {
String html = "<!doctype html><title>Foo</title><p>Bar.";
Document doc = Jsoup.parse(html, TrackingHtmlParser);
StringBuilder track = new StringBuilder();
doc.forEachNode(node -> accumulatePositions(node, track));
assertEquals("#document:0-0~40-40; #doctype:0-15; html:15-15~40-40; head:15-15~33-33; title:15-22~15-33; #text:22-25; body:33-33~40-40; p:33-36~40-40; #text:36-40; ", track.toString());
}

@Test void tracksDocumentXml() {
String html = "<!doctype html><title>Foo</title><p>Bar.";
Document doc = Jsoup.parse(html, TrackingXmlParser);
StringBuilder track = new StringBuilder();
doc.forEachNode(node -> accumulatePositions(node, track));
assertEquals("#document:0-0~40-40; #doctype:0-15; title:15-22~25-33; #text:22-25; p:33-36~40-40; #text:36-40; ", track.toString());
}

@Test void updateKeyMaintainsRangeUc() {
String html = "<p xsi:CLASS=On>One</p>";
Document doc = Jsoup.parse(html, TrackingXmlParser);
Expand Down
20 changes: 10 additions & 10 deletions src/test/java/org/jsoup/parser/StreamParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ void canStream() {
StringBuilder seen;
seen = new StringBuilder();
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString());
assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;#root;", seen.toString());
// checks expected order, and the + indicates that element had a next sibling at time of emission
}
}
Expand All @@ -48,7 +48,7 @@ void canStreamXml() {
StringBuilder seen;
seen = new StringBuilder();
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;", seen.toString());
assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;#root;", seen.toString());
// checks expected order, and the + indicates that element had a next sibling at time of emission
}
}
Expand All @@ -64,7 +64,7 @@ void canStreamXml() {
trackSeen(it.next(), seen);
}

assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;", seen.toString());
assertEquals("title[Test];head+;div#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];body;html;#root;", seen.toString());
// checks expected order, and the + indicates that element had a next sibling at time of emission
}

Expand All @@ -75,13 +75,13 @@ void canStreamXml() {

StringBuilder seen = new StringBuilder();
parser.stream().forEach(el -> trackSeen(el, seen));
assertEquals("head+;p[One]+;p[Two];body;html;", seen.toString());
assertEquals("head+;p[One]+;p[Two];body;html;#root;", seen.toString());

String html2 = "<div>Three<div>Four</div></div>";
StringBuilder seen2 = new StringBuilder();
parser.parse(html2, "");
parser.stream().forEach(el -> trackSeen(el, seen2));
assertEquals("head+;div[Four];div[Three];body;html;", seen2.toString());
assertEquals("head+;div[Four];div[Three];body;html;#root;", seen2.toString());

// re-run without a new parse should be empty
StringBuilder seen3 = new StringBuilder();
Expand Down Expand Up @@ -247,7 +247,7 @@ static void trackSeen(Element el, StringBuilder actual) {
StreamParser streamer = basic();
assertFalse(isClosed(streamer));
long count = streamer.stream().count();
assertEquals(6, count);
assertEquals(7, count);

assertTrue(isClosed(streamer));
}
Expand All @@ -261,7 +261,7 @@ static void trackSeen(Element el, StringBuilder actual) {
it.next();
count++;
}
assertEquals(6, count);
assertEquals(7, count);
assertTrue(isClosed(streamer));
}

Expand Down Expand Up @@ -384,7 +384,7 @@ void canStreamFragment() {
try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) {
StringBuilder seen = new StringBuilder();
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString());
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;#root;", seen.toString());
// checks expected order, and the + indicates that element had a next sibling at time of emission
// note that we don't get a full doc, just the fragment (and the context at the end of the stack)

Expand All @@ -405,7 +405,7 @@ void canStreamFragment() {
trackSeen(it.next(), seen);
}

assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString());
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;#root;", seen.toString());
// checks expected order, and the + indicates that element had a next sibling at time of emission
// note that we don't get a full doc, just the fragment (and the context at the end of the stack)

Expand Down Expand Up @@ -451,7 +451,7 @@ void canStreamFragmentXml() throws IOException {
try (StreamParser parser = new StreamParser(Parser.xmlParser()).parseFragment(html, context, "")) {
StringBuilder seen = new StringBuilder();
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;", seen.toString());
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;#root;", seen.toString());
// checks expected order, and the + indicates that element had a next sibling at time of emission
// note that we don't get a full doc, just the fragment

Expand Down
4 changes: 3 additions & 1 deletion src/test/java/org/jsoup/safety/CleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,8 @@ public void bailsIfRemovingProtocolThatsNotSet() {
Element p = orig.expectFirst("p");
Range origRange = p.sourceRange();
assertEquals("2,2:22-2,10:30", origRange.toString());
assertEquals("1,1:0-1,1:0", orig.sourceRange().toString());
assertEquals("2,19:39-2,19:39", orig.endSourceRange().toString());

Range.AttributeRange attributeRange = p.attributes().sourceRange("id");
assertEquals("2,5:25-2,7:27=2,8:28-2,9:29", attributeRange.toString());
Expand All @@ -404,7 +406,7 @@ public void bailsIfRemovingProtocolThatsNotSet() {
assertEquals("1", cleanP.id());
Range cleanRange = cleanP.sourceRange();
assertEquals(origRange, cleanRange);
assertEquals(orig.endSourceRange(), clean.endSourceRange());
assertEquals(p.endSourceRange(), cleanP.endSourceRange());
assertEquals(attributeRange, cleanP.attributes().sourceRange("id"));
}

Expand Down

0 comments on commit eb602e7

Please sign in to comment.