Skip to content

Commit

Permalink
Merge pull request #1934 from kares/jruby-review-8_DETACHED-merge-back
Browse files Browse the repository at this point in the history
[jruby] cleanup/refactor extension code
  • Loading branch information
flavorjones authored Nov 26, 2019
2 parents 0d2c5aa + 604a0db commit 8a1c981
Show file tree
Hide file tree
Showing 29 changed files with 1,040 additions and 1,212 deletions.
70 changes: 31 additions & 39 deletions ext/java/nokogiri/HtmlDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@

import nokogiri.internals.HtmlDomParserContext;

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;

/**
* Class for Nokogiri::HTML::Document.
*
Expand All @@ -64,21 +66,25 @@ public class HtmlDocument extends XmlDocument {
/**
 * Creates an HtmlDocument for the given runtime and Ruby class by delegating
 * straight to the XmlDocument constructor; no DOM document is supplied here.
 *
 * @param ruby  the JRuby runtime
 * @param klazz the Ruby class handed to the superclass constructor
 */
public HtmlDocument(Ruby ruby, RubyClass klazz) {
super(ruby, klazz);
}


/**
 * Creates an HtmlDocument around an existing DOM document. The Ruby class is
 * resolved through getNokogiriClass and construction is delegated to the
 * (Ruby, RubyClass, Document) constructor.
 *
 * NOTE(review): the class looked up is "Nokogiri::XML::Document" even though
 * this is the HTML document type - confirm that this is intentional.
 *
 * @param runtime  the JRuby runtime
 * @param document the DOM document to wrap
 */
public HtmlDocument(Ruby runtime, Document document) {
this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document);
}

/**
 * Creates an HtmlDocument for the given runtime, Ruby class and DOM document
 * by delegating to the matching XmlDocument constructor.
 *
 * @param ruby  the JRuby runtime
 * @param klazz the Ruby class handed to the superclass constructor
 * @param doc   the DOM document handed to the superclass constructor
 */
public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
super(ruby, klazz, doc);
}

@JRubyMethod(name="new", meta = true, rest = true, required=0)
public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz,
IRubyObject[] args) {
public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) {
final Ruby runtime = context.runtime;
HtmlDocument htmlDocument;
try {
Document docNode = createNewDocument();
htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz);
htmlDocument.setDocumentNode(context, docNode);
Document docNode = createNewDocument(runtime);
htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz);
htmlDocument.setDocumentNode(context.runtime, docNode);
} catch (Exception ex) {
throw context.getRuntime().newRuntimeError("couldn't create document: " + ex);
throw asRuntimeError(runtime, "couldn't create document: ", ex);
}

Helpers.invoke(context, htmlDocument, "initialize", args);
Expand Down Expand Up @@ -107,36 +113,30 @@ public IRubyObject getInternalSubset(ThreadContext context) {

return internalSubset;
}

public void setDocumentNode(ThreadContext context, Node node) {
super.setNode(context, node);
Ruby runtime = context.getRuntime();
if (node != null) {
Document document = (Document)node;
document.normalize();
stabilzeAttrValue(document.getDocumentElement());
}

@Override
void init(Ruby runtime, Document document) {
stabilizeTextContent(document);
document.normalize();
setInstanceVariable("@decorators", runtime.getNil());
if (document.getDocumentElement() != null) {
stabilizeAttrs(document.getDocumentElement());
}
}

private void stabilzeAttrValue(Node node) {
if (node == null) return;

private static void stabilizeAttrs(Node node) {
if (node.hasAttributes()) {
NamedNodeMap nodeMap = node.getAttributes();
for (int i=0; i<nodeMap.getLength(); i++) {
Node n = nodeMap.item(i);
if (n instanceof Attr) {
Attr attr = (Attr)n;
String attrName = attr.getName();
// not sure, but need to get value always before document is referred.
// or lose attribute value
String attrValue = attr.getValue(); // don't delete this line
stabilizeAttr((Attr) n);
}
}
}
NodeList children = node.getChildNodes();
for (int i=0; i<children.getLength(); i++) {
stabilzeAttrValue(children.item(i));
stabilizeAttrs(children.item(i));
}
}

Expand All @@ -156,14 +156,10 @@ public String getPraedEncoding() {
* and +options+. See Nokogiri::HTML.parse
*/
@JRubyMethod(meta = true, required = 4)
public static IRubyObject read_io(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
Ruby ruby = context.getRuntime();
HtmlDomParserContext ctx =
new HtmlDomParserContext(ruby, args[2], args[3]);
public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
ctx.setIOInputSource(context, args[0], args[1]);
return ctx.parse(context, cls, args[1]);
return ctx.parse(context, (RubyClass) klass, args[1]);
}

/*
Expand All @@ -174,13 +170,9 @@ public static IRubyObject read_io(ThreadContext context,
* and +options+. See Nokogiri::HTML.parse
*/
@JRubyMethod(meta = true, required = 4)
public static IRubyObject read_memory(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
Ruby ruby = context.getRuntime();
HtmlDomParserContext ctx =
new HtmlDomParserContext(ruby, args[2], args[3]);
public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
ctx.setStringInputSource(context, args[0], args[1]);
return ctx.parse(context, cls, args[1]);
return ctx.parse(context, (RubyClass) klass, args[1]);
}
}
146 changes: 114 additions & 32 deletions ext/java/nokogiri/HtmlSaxParserContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,28 @@

package nokogiri;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.parsers.SAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;

import nokogiri.internals.NokogiriHandler;
import nokogiri.internals.NokogiriHelpers;
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;

/**
* Class for Nokogiri::HTML::SAX::ParserContext.
Expand All @@ -59,10 +66,16 @@
@JRubyClass(name="Nokogiri::HTML::SAX::ParserContext", parent="Nokogiri::XML::SAX::ParserContext")
public class HtmlSaxParserContext extends XmlSaxParserContext {

/**
 * Factory for parser contexts: constructs an HtmlSaxParserContext for the
 * given Ruby class and calls initialize(Ruby) on it before returning it.
 *
 * @param runtime the JRuby runtime
 * @param klazz   the Ruby class the new context is created for
 * @return the constructed context, after initialize(runtime) has been invoked
 */
static HtmlSaxParserContext newInstance(final Ruby runtime, final RubyClass klazz) {
    final HtmlSaxParserContext context = new HtmlSaxParserContext(runtime, klazz);
    context.initialize(runtime);
    return context;
}

/**
 * Creates a parser context for the given runtime and Ruby class by delegating
 * to the XmlSaxParserContext constructor.
 *
 * @param ruby      the JRuby runtime
 * @param rubyClass the Ruby class handed to the superclass constructor
 */
public HtmlSaxParserContext(Ruby ruby, RubyClass rubyClass) {
super(ruby, rubyClass);
}

@Override
protected AbstractSAXParser createParser() throws SAXException {
SAXParser parser = new SAXParser();
Expand All @@ -89,10 +102,14 @@ public static IRubyObject parse_memory(ThreadContext context,
IRubyObject klazz,
IRubyObject data,
IRubyObject encoding) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
ctx.initialize(context.getRuntime());
ctx.java_encoding = NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding);
ctx.setStringInputSource(context, data, context.nil);
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
String javaEncoding = findEncodingName(context, encoding);
if (javaEncoding != null) {
CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
ctx.setInputSource(istream);
ctx.getInputSource().setEncoding(javaEncoding);
}
return ctx;
}

Expand Down Expand Up @@ -136,58 +153,123 @@ public int getValue() {
public String toString() {
return name;
}

/**
 * Snapshot of the enum constants, taken once during class initialization so
 * that lookups do not clone the constants array on every call. Being a
 * static final field it is safely published to all threads (the previous
 * lazily-filled, non-final field was not, and its {@code transient}
 * modifier had no effect on a static field).
 */
private static final EncodingType[] VALUES = EncodingType.values();

/**
 * Looks up the constant at the given position.
 *
 * NOTE: assuming ordinal == value
 *
 * @param ordinal zero-based position of the wanted constant
 * @return the constant at that position, or {@code null} when the index is
 *         negative or past the last constant
 */
static EncodingType get(final int ordinal) {
    if (ordinal < 0 || ordinal >= VALUES.length) {
        return null;
    }
    return VALUES[ordinal];
}

}

/**
 * Maps an integer encoding value to the name stored on the matching
 * EncodingType constant.
 *
 * @param value integer encoding value, used as the index for EncodingType.get
 * @return the constant's name, or {@code null} when EncodingType.get finds
 *         no constant for the value
 */
private static String findEncodingName(final int value) {
    final EncodingType match = EncodingType.get(value);
    if (match != null) {
        // the lookup is positional; double-check it really is the requested value
        assert match.value == value;
        return match.name;
    }
    return null;
}

private static String findName(final int value) {
for (EncodingType type : EncodingType.values()) {
if (type.getValue() == value) return type.toString();

private static String findEncodingName(ThreadContext context, IRubyObject encoding) {
String rubyEncoding = null;
if (encoding instanceof RubyString) {
rubyEncoding = rubyStringToString((RubyString) encoding);
}
else if (encoding instanceof RubyFixnum) {
rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
}
if (rubyEncoding == null) return null;
try {
return Charset.forName(rubyEncoding).displayName();
}
catch (UnsupportedCharsetException e) {
throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
}
catch (IllegalCharsetNameException e) {
throw context.getRuntime().newInvalidEncoding(e.getMessage());
}
return null;
}

private static String findEncoding(ThreadContext context, IRubyObject encoding) {
// HTML::Sax::Parser leaks a libxml implementation detail and passes an
// Encoding integer to parse_io. We have to reverse map the integer
// into a name.
if (encoding instanceof RubyFixnum) {
int value = RubyFixnum.fix2int((RubyFixnum) encoding);
return findName(value);

// Matches a "charset=<name>" token case-insensitively: the literal "charset", at most one
// whitespace character on either side of '=', then one or more letters, digits, '-' or '_'.
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+", Pattern.CASE_INSENSITIVE);

/**
 * Rewrites the last "charset=..." token found in the input so that it names
 * the given encoding.
 *
 * @param input the text to scan
 * @param enc   the encoding name to write after "charset="
 * @return a StringBuilder holding the rewritten text when a charset token was
 *         found, otherwise the input itself, unchanged
 */
private static CharSequence applyEncoding(final String input, final String enc) {
    int matchStart = 0;
    int matchEnd = 0;
    // cheap substring probe first, so the regex only runs when it can match
    if (containsIgnoreCase(input, "charset")) {
        final Matcher matcher = CHARSET_PATTERN.matcher(input);
        // keep advancing: only the bounds of the final match survive the loop
        while (matcher.find()) {
            matchStart = matcher.start();
            matchEnd = matcher.end();
        }
    }
    if (matchStart == matchEnd) {
        return input; // nothing matched
    }
    final StringBuilder rewritten = new StringBuilder(input);
    return rewritten.replace(matchStart, matchEnd, "charset=" + enc);
}

return NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding);
/**
 * Case-insensitive substring test.
 *
 * @param str the text to search in
 * @param sub the text to search for
 * @return {@code true} when {@code sub} is empty or occurs somewhere in
 *         {@code str} ignoring case, {@code false} otherwise
 */
private static boolean containsIgnoreCase(final String str, final String sub) {
    final int subLength = sub.length();
    final int lastStart = str.length() - subLength;
    if (subLength == 0) {
        return true;
    }

    final char head = sub.charAt(0);
    final char headLower = Character.toLowerCase(head);
    final char headUpper = Character.toUpperCase(head);

    for (int start = 0; start <= lastStart; start++) {
        final char candidate = str.charAt(start);
        // compare the first character by hand before paying for a region comparison
        final boolean headMatches = candidate == headLower
                || Character.toLowerCase(candidate) == headLower
                || Character.toUpperCase(candidate) == headUpper;
        if (headMatches && str.regionMatches(true, start + 1, sub, 1, subLength - 1)) {
            return true;
        }
    }
    return false;
}

@JRubyMethod(name="file", meta=true)
public static IRubyObject parse_file(ThreadContext context,
IRubyObject klazz,
IRubyObject klass,
IRubyObject data,
IRubyObject encoding) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
ctx.initialize(context.getRuntime());
ctx.java_encoding = NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding);
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
ctx.setInputSourceFile(context, data);
String javaEncoding = findEncodingName(context, encoding);
if (javaEncoding != null) {
ctx.getInputSource().setEncoding(javaEncoding);
}
return ctx;
}

@JRubyMethod(name="io", meta=true)
public static IRubyObject parse_io(ThreadContext context,
IRubyObject klazz,
IRubyObject klass,
IRubyObject data,
IRubyObject encoding) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
ctx.initialize(context.getRuntime());
ctx.java_encoding = findEncoding(context, encoding);
ctx.setIOInputSource(context, data, context.getRuntime().getNil());
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
ctx.setIOInputSource(context, data, context.nil);
String javaEncoding = findEncodingName(context, encoding);
if (javaEncoding != null) {
ctx.getInputSource().setEncoding(javaEncoding);
}
return ctx;
}

/**
* Create a new parser context that will read from a raw input stream.
* Meant to be run in a separate thread by HtmlSaxPushParser.
*/
static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klazz, InputStream stream) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz);
ctx.initialize(runtime);
static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klass, InputStream stream) {
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(runtime, klass);
ctx.setInputSource(stream);
return ctx;
}
Expand Down
2 changes: 1 addition & 1 deletion ext/java/nokogiri/NokogiriService.java
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) {
}
};

public static final ObjectAllocator HTML_SAXPARSER_CONTEXT_ALLOCATOR = new ObjectAllocator() {
private static final ObjectAllocator HTML_SAXPARSER_CONTEXT_ALLOCATOR = new ObjectAllocator() {
private HtmlSaxParserContext htmlSaxParserContext = null;
public IRubyObject allocate(Ruby runtime, RubyClass klazz) {
if (htmlSaxParserContext == null) htmlSaxParserContext = new HtmlSaxParserContext(runtime, klazz);
Expand Down
Loading

0 comments on commit 8a1c981

Please sign in to comment.