Skip to content

Commit

Permalink
Improved accuracy of tracked source positions (#2056)
Browse files Browse the repository at this point in the history
Revised tree builder method names
Using more explicit names vs overrides; removed some duplication.
Improved accuracy of tracked source positions
Track implicit vs explicitly created / closed elements
  • Loading branch information
jhy authored Nov 20, 2023
1 parent 6307b94 commit 2cf9e90
Show file tree
Hide file tree
Showing 8 changed files with 452 additions and 307 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ Release 1.17.1 [PENDING]
* Improvement: repackaged the library with native (vs automatic) JPMS module support.
<https://github.com/jhy/jsoup/pull/2025>

* Improvement: better fidelity of source positions when tracking is enabled. Implicitly created or closed elements
are also tracked, and are detectable via Range.isImplicit().
<https://github.com/jhy/jsoup/pull/2056>

* Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes (<script> and <style>) should
be emitted as CDATA nodes, so that they can be parsed correctly by an XML parser.
<https://github.com/jhy/jsoup/pull/1720>
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/org/jsoup/nodes/LeafNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

import java.util.List;

abstract class LeafNode extends Node {
/**
A node that does not hold any children. E.g.: {@link TextNode}, {@link DataNode}, {@link Comment}.
*/
public abstract class LeafNode extends Node {
Object value; // either a string value, or an attribute map (in the rare case multiple attributes are set)

protected final boolean hasAttributes() {
Expand Down
34 changes: 33 additions & 1 deletion src/main/java/org/jsoup/nodes/Range.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ public Position start() {
return start;
}

/**
 Convenience accessor for the cursor position at which this range starts, read directly from the start
 {@link Position}.
 @return the 0-based start cursor position.
 @since 1.17.1
 */
public int startPos() {
    return this.start.pos;
}

/**
Get the end position of this node.
* @return the end position
Expand All @@ -43,6 +52,15 @@ public Position end() {
return end;
}

/**
 Convenience accessor for the cursor position at which this range ends, read directly from the end
 {@link Position}.
 @return the 0-based ending cursor position.
 @since 1.17.1
 */
public int endPos() {
    return this.end.pos;
}

/**
Test if this source range was tracked during parsing.
* @return true if this was tracked during parsing, false otherwise (and all fields will be {@code -1}).
Expand All @@ -51,6 +69,20 @@ public boolean isTracked() {
return this != Untracked;
}

/**
 Tests whether this range represents a node that was implicitly created or closed.
 <p>For example, given the HTML {@code <p>One<p>Two}, each {@code p} element has an explicit
 {@link Element#sourceRange()} but an implicit {@link Element#endSourceRange()} marking its end position, as neither
 has a closing {@code </p>} tag. The TextNodes have explicit sourceRanges.</p>
 <p>A range is treated as implicit when its start and end positions are equal. An untracked range is never
 implicit.</p>
 @return true if the range is tracked and its start and end positions are the same, false otherwise.
 @since 1.17.1
 */
public boolean isImplicit() {
    // short-circuit: an untracked range is reported as not implicit without comparing positions
    return isTracked() && start.equals(end);
}

/**
Retrieves the source range for a given Node.
* @param node the node to retrieve the position for
Expand Down Expand Up @@ -124,7 +156,7 @@ public Position(int pos, int lineNumber, int columnNumber) {

/**
Gets the position index (0-based) of the original input source that this Position was read at. This tracks the
total number of characters read into the source at this position, regardless of the number of preceeding lines.
total number of characters read into the source at this position, regardless of the number of preceding lines.
* @return the position, or {@code -1} if untracked.
*/
public int pos() {
Expand Down
Loading

0 comments on commit 2cf9e90

Please sign in to comment.