Skip to content

Commit

Permalink
Flyweight Tag.valueOf in TreeBuilder
Browse files Browse the repository at this point in the history
Lowers memory consumption when many repeated custom tags were in the parse.

(Known tags were already re-used)
  • Loading branch information
jhy committed Sep 27, 2021
1 parent a8df71b commit d3f4e31
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ jsoup changelog
memoizing the </title> scan and reducing GC.
<https://github.com/jhy/jsoup/issues/1644>

* Improvement: when parsing custom tags (in HTML or XML), added a flyweight cache on Tag.valueOf(name) to reduce
memory overhead when many tags are repeated.


* Bugfix: when tracking errors or checking for validity in the Cleaner, errors were incorrectly raised for missing
optional closing tags.

Expand Down
10 changes: 5 additions & 5 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ List<Node> parseFragment(String inputFragment, @Nullable Element context, String
default:
tokeniser.transition(TokeniserState.Data);
}
root = new Element(Tag.valueOf(contextTag, settings), baseUri);
root = new Element(tagFor(contextTag, settings), baseUri);
doc.appendChild(root);
stack.add(root);
resetInsertionMode();
Expand Down Expand Up @@ -245,13 +245,13 @@ Element insert(final Token.StartTag startTag) {
return el;
}

Element el = new Element(Tag.valueOf(startTag.name(), settings), null, settings.normalizeAttributes(startTag.attributes));
Element el = new Element(tagFor(startTag.name(), settings), null, settings.normalizeAttributes(startTag.attributes));
insert(el);
return el;
}

Element insertStartTag(String startTagName) {
Element el = new Element(Tag.valueOf(startTagName, settings), null);
Element el = new Element(tagFor(startTagName, settings), null);
insert(el);
return el;
}
Expand All @@ -262,7 +262,7 @@ void insert(Element el) {
}

Element insertEmpty(Token.StartTag startTag) {
Tag tag = Tag.valueOf(startTag.name(), settings);
Tag tag = tagFor(startTag.name(), settings);
Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes));
insertNode(el);
if (startTag.isSelfClosing()) {
Expand All @@ -277,7 +277,7 @@ Element insertEmpty(Token.StartTag startTag) {
}

FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) {
Tag tag = Tag.valueOf(startTag.name(), settings);
Tag tag = tagFor(startTag.name(), settings);
FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes));
if (checkTemplateStack) {
if(!onStack("template"))
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -891,7 +891,7 @@ else if (!tb.onStack(formatEl)) {
} else if (node == formatEl)
break;

Element replacement = new Element(Tag.valueOf(node.nodeName(), ParseSettings.preserveCase), tb.getBaseUri());
Element replacement = new Element(tb.tagFor(node.nodeName(), ParseSettings.preserveCase), tb.getBaseUri());
// case will follow the original node (so honours ParseSettings)
tb.replaceActiveFormattingElement(node, replacement);
tb.replaceOnStack(node, replacement);
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
import javax.annotation.ParametersAreNonnullByDefault;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* @author Jonathan Hedley
Expand All @@ -24,6 +26,7 @@ abstract class TreeBuilder {
protected String baseUri; // current base uri, for creating new elements
protected Token currentToken; // currentToken is used only for error tracking.
protected ParseSettings settings;
protected Map<String, Tag> seenTags; // tags we've used in this parse; saves tag GC for custom tags.

private Token.StartTag start = new Token.StartTag(); // start tag to process
private Token.EndTag end = new Token.EndTag();
Expand All @@ -44,6 +47,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
currentToken = null;
tokeniser = new Tokeniser(reader, parser.getErrors());
stack = new ArrayList<>(32);
seenTags = new HashMap<>();
this.baseUri = baseUri;
}

Expand All @@ -57,6 +61,7 @@ Document parse(Reader input, String baseUri, Parser parser) {
reader = null;
tokeniser = null;
stack = null;
seenTags = null;

return doc;
}
Expand Down Expand Up @@ -159,4 +164,13 @@ protected void error(String msg, Object... args) {
protected boolean isContentForTagData(String normalName) {
return false;
}

protected Tag tagFor(String tagName, ParseSettings settings) {
Tag tag = seenTags.get(tagName); // note that we don't normalize the cache key. But tag via valueOf may be normalized.
if (tag == null) {
tag = Tag.valueOf(tagName, settings);
seenTags.put(tagName, tag);
}
return tag;
}
}
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/XmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ private void insertNode(Node node) {
}

Element insert(Token.StartTag startTag) {
Tag tag = Tag.valueOf(startTag.name(), settings);
Tag tag = tagFor(startTag.name(), settings);
// todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
if (startTag.hasAttributes())
startTag.attributes.deduplicate(settings);
Expand Down
16 changes: 16 additions & 0 deletions src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -298,4 +298,20 @@ public void handlesLTinScript() {
String out = doc.html();
assertEquals("<body style=\"color: red\" name=\"\"><div></div></body>", out);
}

@Test void customTagsAreFlyweights() {
String xml = "<foo>Foo</foo><foo>Foo</foo><FOO>FOO</FOO><FOO>FOO</FOO>";
Document doc = Jsoup.parse(xml, Parser.xmlParser());
Elements els = doc.children();

Tag t1 = els.get(0).tag();
Tag t2 = els.get(1).tag();
Tag t3 = els.get(2).tag();
Tag t4 = els.get(3).tag();
assertEquals("foo", t1.getName());
assertEquals("FOO", t3.getName());
assertSame(t1, t2);
assertSame(t3, t4);

}
}

0 comments on commit d3f4e31

Please sign in to comment.