Skip to content

Commit

Permalink
Refactored UserData to be tucked into a hash (#2060)
Browse files Browse the repository at this point in the history
And make sure attribute source ranges are copied in the cleaner
  • Loading branch information
jhy authored Nov 23, 2023
1 parent 58521a4 commit 73d4506
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 96 deletions.
10 changes: 4 additions & 6 deletions src/main/java/org/jsoup/internal/SharedConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
this package when modules are enabled.
*/
public final class SharedConstants {
// Indicates a jsoup internal key. Can't be set via HTML. (It could be set via accessor, but not too worried about
// that. Suppressed from list, iter.
public static final char InternalPrefix = '/';
public static final String PrivatePrefix = "/jsoup.";

public static final String AttrRange = PrivatePrefix + "attrRange.";
public static final String UserDataKey = "/jsoup.userdata";
public final static String AttrRangeKey = "jsoup.attrs";
public static final String RangeKey = "jsoup.start";
public static final String EndRangeKey = "jsoup.end";

public static final int DefaultBufferSize = 1024 * 32;

Expand Down
10 changes: 1 addition & 9 deletions src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ Get the source ranges (start to end positions) in the original input source from
@since 1.17.1
*/
public Range.AttributeRange sourceRange() {
if (parent == null) return Range.AttributeRange.Untracked;
if (parent == null) return Range.AttributeRange.UntrackedAttr;
return parent.sourceRange(key);
}

Expand Down Expand Up @@ -211,14 +211,6 @@ protected static boolean isDataAttribute(String key) {
return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
}

/**
Is this an internal attribute? Internal attributes can be fetched by key, but are not serialized.
* @return if an internal attribute.
*/
public boolean isInternal() {
return Attributes.isInternalKey(key);
}

/**
* Collapsible if it's a boolean attribute and value is empty or same as name
*
Expand Down
79 changes: 50 additions & 29 deletions src/main/java/org/jsoup/nodes/Attributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;

import static org.jsoup.internal.Normalizer.lowerCase;
import static org.jsoup.nodes.Range.AttributeRange.Untracked;
import static org.jsoup.internal.SharedConstants.AttrRangeKey;
import static org.jsoup.nodes.Range.AttributeRange.UntrackedAttr;

/**
* The attributes of an Element.
Expand All @@ -39,6 +41,10 @@
* @author Jonathan Hedley, [email protected]
*/
public class Attributes implements Iterable<Attribute>, Cloneable {
// Indicates an internal key. Can't be set via HTML. (It could be set via accessor, but not too worried about
// that. Suppressed from list, iter.)
static final char InternalPrefix = '/';

// The Attributes object is only created on the first use of an attribute; the Element will just have a null
// Attribute slot otherwise
protected static final String dataPrefix = "data-";
Expand Down Expand Up @@ -114,21 +120,6 @@ public String getIgnoreCase(String key) {
return i == NotFound ? EmptyString : checkNotNull(vals[i]);
}

/**
Get an arbitrary user data object by key.
* @param key case-sensitive key to the object.
* @return the object associated to this key, or {@code null} if not found.
* @see #userData(String key, Object val)
* @since 1.17.2
*/
@Nullable
public Object userData(String key) {
Validate.notNull(key);
if (!isInternalKey(key)) key = internalKey(key);
int i = indexOfKey(key);
return i == NotFound ? null : vals[i];
}

/**
* Adds a new attribute. Will produce duplicates if the key already exists.
* @see Attributes#put(String, String)
Expand Down Expand Up @@ -161,6 +152,40 @@ public Attributes put(String key, @Nullable String value) {
return this;
}

/**
Get the map holding any user-data associated with these Attributes. Will be created empty on first use. Held as
an internal attribute, not a field member, to reduce the memory footprint of Attributes when not used. Can hold
arbitrary objects; use for source ranges, connecting W3C nodes to Elements, etc.
* @return the map holding user-data
*/
Map<String, Object> userData() {
final Map<String, Object> userData;
int i = indexOfKey(SharedConstants.UserDataKey);
if (i == NotFound) {
userData = new HashMap<>();
addObject(SharedConstants.UserDataKey, userData);
} else {
//noinspection unchecked
userData = (Map<String, Object>) vals[i];
}
return userData;
}

/**
Get an arbitrary user-data object by key.
* @param key case-sensitive key to the object.
* @return the object associated to this key, or {@code null} if not found.
* @see #userData(String key, Object val)
* @since 1.17.1
*/
@Nullable
public Object userData(String key) {
Validate.notNull(key);
if (!hasKey(SharedConstants.UserDataKey)) return null; // no user data exists
Map<String, Object> userData = userData();
return userData.get(key);
}

/**
Set an arbitrary user-data object by key. Will be treated as an internal attribute, so will not be emitted in HTML.
* @param key case-sensitive key
Expand All @@ -171,13 +196,7 @@ public Attributes put(String key, @Nullable String value) {
*/
public Attributes userData(String key, Object value) {
Validate.notNull(key);
if (!isInternalKey(key)) key = internalKey(key);
Validate.notNull(value);
int i = indexOfKey(key);
if (i != NotFound)
vals[i] = value;
else
addObject(key, value);
userData().put(key, value);
return this;
}

Expand Down Expand Up @@ -340,10 +359,12 @@ Get the source ranges (start to end position) in the original input source from
@since 1.17.1
*/
public Range.AttributeRange sourceRange(String key) {
if (!hasKey(key)) return Untracked;
final String rangeKey = SharedConstants.AttrRange + key;
if (!hasDeclaredValueForKey(rangeKey)) return Untracked;
return (Range.AttributeRange) Validate.ensureNotNull(userData(rangeKey));
if (!hasKey(key)) return UntrackedAttr;
//noinspection unchecked
Map<String, Range.AttributeRange> ranges = (Map<String, Range.AttributeRange>) userData(AttrRangeKey);
if (ranges == null) return Range.AttributeRange.UntrackedAttr;
Range.AttributeRange range = ranges.get(key);
return range != null ? range : Range.AttributeRange.UntrackedAttr;
}

@Override
Expand Down Expand Up @@ -592,10 +613,10 @@ private static String dataKey(String key) {
}

static String internalKey(String key) {
return SharedConstants.InternalPrefix + key;
return InternalPrefix + key;
}

static boolean isInternalKey(String key) {
return key != null && key.length() > 1 && key.charAt(0) == SharedConstants.InternalPrefix;
return key != null && key.length() > 1 && key.charAt(0) == InternalPrefix;
}
}
9 changes: 6 additions & 3 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -1821,7 +1821,9 @@ public Element clone() {
@Override
public Element shallowClone() {
// simpler than implementing a clone version with no child copy
return new Element(tag, baseUri(), attributes == null ? null : attributes.clone());
String baseUri = baseUri();
if (baseUri.equals("")) baseUri = null; // saves setting a blank internal attribute
return new Element(tag, baseUri, attributes == null ? null : attributes.clone());
}

@Override
Expand All @@ -1838,8 +1840,9 @@ protected Element doClone(@Nullable Node parent) {
@Override
public Element clearAttributes() {
if (attributes != null) {
super.clearAttributes();
attributes = null;
super.clearAttributes(); // keeps internal attributes via iterator
if (attributes.size() == 0)
attributes = null; // only remove entirely if no internal attributes
}

return this;
Expand Down
26 changes: 8 additions & 18 deletions src/main/java/org/jsoup/nodes/Range.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
package org.jsoup.nodes;

import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;

import java.util.Objects;

import static org.jsoup.internal.SharedConstants.*;

/**
Expand All @@ -15,11 +10,8 @@
@since 1.15.2
*/
public class Range {
private final Position start, end;

private static final String RangeKey = PrivatePrefix + "sourceRange";
private static final String EndRangeKey = PrivatePrefix + "endSourceRange";
private static final Position UntrackedPos = new Position(-1, -1, -1);
private final Position start, end;

/** An untracked source range. */
static final Range Untracked = new Range(UntrackedPos, UntrackedPos);
Expand Down Expand Up @@ -98,18 +90,16 @@ public boolean isImplicit() {
*/
static Range of(Node node, boolean start) {
final String key = start ? RangeKey : EndRangeKey;
if (!node.hasAttr(key)) return Untracked;
return (Range) Validate.ensureNotNull(node.attributes().userData(key));
if (!node.hasAttributes()) return Untracked;
Object range = node.attributes().userData(key);
return range != null ? (Range) range : Untracked;
}

/**
Internal jsoup method, called by the TreeBuilder. Tracks a Range for a Node.
* @param node the node to associate this position to
* @param start if this is the starting range. {@code false} for Element end tags.
@deprecated no-op; internal method moved out of visibility
*/
public void track(Node node, boolean start) {
node.attributes().userData(start ? RangeKey : EndRangeKey, this);
}
@Deprecated
public void track(Node node, boolean start) {}

@Override
public boolean equals(Object o) {
Expand Down Expand Up @@ -222,7 +212,7 @@ public int hashCode() {
}

public static class AttributeRange {
static final AttributeRange Untracked = new AttributeRange(Range.Untracked, Range.Untracked);
static final AttributeRange UntrackedAttr = new AttributeRange(Range.Untracked, Range.Untracked);

private final Range nameRange;
private final Range valueRange;
Expand Down
20 changes: 13 additions & 7 deletions src/main/java/org/jsoup/parser/Token.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Range;
import org.jspecify.annotations.Nullable;

import java.util.HashMap;
import java.util.Map;

import static org.jsoup.internal.SharedConstants.*;


Expand Down Expand Up @@ -186,6 +188,15 @@ private void trackAttributeRange(String name) {
if (trackSource && isStartTag()) {
final StartTag start = asStartTag();
final CharacterReader r = start.reader;
assert attributes != null;
//noinspection unchecked
Map<String, Range.AttributeRange> attrRanges =
(Map<String, Range.AttributeRange>) attributes.userData(AttrRangeKey);
if (attrRanges == null) {
attrRanges = new HashMap<>();
attributes.userData(AttrRangeKey, attrRanges);
}
if (attrRanges.containsKey(name)) return; // dedupe ranges on case-sensitive name as we go; actual attributes get deduped later

// if there's no value (e.g. boolean), make it an implicit range at current
if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd;
Expand All @@ -198,12 +209,7 @@ private void trackAttributeRange(String name) {
new Range.Position(attrValStart, r.lineNumber(attrValStart), r.columnNumber(attrValStart)),
new Range.Position(attrValEnd, r.lineNumber(attrValEnd), r.columnNumber(attrValEnd)))
);

// todo - deduping as we go as case-sensitive key; want first
String key = AttrRange + name;

assert attributes != null;
attributes.userData(key, range);
attrRanges.put(name, range);
}
}

Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
Expand Down Expand Up @@ -279,6 +278,6 @@ private void trackNodePosition(Node node, boolean isStart) {
Range.Position endPosition = new Range.Position
(endPos, reader.lineNumber(endPos), reader.columnNumber(endPos));
Range range = new Range(startPosition, endPosition);
range.track(node, isStart);
node.attributes().userData(isStart ? SharedConstants.RangeKey : SharedConstants.EndRangeKey, range);
}
}
19 changes: 5 additions & 14 deletions src/main/java/org/jsoup/safety/Cleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,12 @@
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.ParseErrorList;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import java.util.List;


/**
The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
that you are expecting; no junk, and no cross-site scripting attacks!
Expand Down Expand Up @@ -178,11 +175,12 @@ private int copySafeNodes(Element source, Element dest) {
}

private ElementMeta createSafeElement(Element sourceEl) {
Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
String sourceTag = sourceEl.tagName();
Attributes destAttrs = new Attributes();
Element dest = new Element(sourceEl.tag(), sourceEl.baseUri(), destAttrs);
int numDiscarded = 0;
Attributes destAttrs = dest.attributes();
dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy

int numDiscarded = 0;
Attributes sourceAttrs = sourceEl.attributes();
for (Attribute sourceAttr : sourceAttrs) {
if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
Expand All @@ -192,14 +190,7 @@ private ElementMeta createSafeElement(Element sourceEl) {
}
Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag);
destAttrs.addAll(enforcedAttrs);

// Copy the original start and end range, if set
// TODO - might be good to make a generic Element#userData set type interface, and copy those all over
if (sourceEl.sourceRange().isTracked())
sourceEl.sourceRange().track(dest, true);
if (sourceEl.endSourceRange().isTracked())
sourceEl.endSourceRange().track(dest, false);

dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
return new ElementMeta(dest, numDiscarded);
}

Expand Down
2 changes: 0 additions & 2 deletions src/test/java/org/jsoup/nodes/PositionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,6 @@ private void printRange(Node node) {

StringBuilder track = new StringBuilder();
for (Attribute attr : div.attributes()) {
if (attr.isInternal()) continue;

Range.AttributeRange attrRange = attr.sourceRange();
assertTrue(attrRange.nameRange().isTracked());
Expand Down Expand Up @@ -339,7 +338,6 @@ private void printRange(Node node) {

StringBuilder track = new StringBuilder();
for (Attribute attr : div.attributes()) {
if (attr.isInternal()) continue;
Range.AttributeRange attrRange = attr.sourceRange();
assertTrue(attrRange.nameRange().isTracked());
assertTrue(attrRange.valueRange().isTracked());
Expand Down
Loading

0 comments on commit 73d4506

Please sign in to comment.