Skip to content

Commit

Permalink
{XML,HTML4,HTML5}::{Document,DocumentFragment}{.parse,#initialize} ta…
Browse files Browse the repository at this point in the history
…ke keyword arguments (#3355)

**What problem is this PR intended to solve?**

As part of #3323 there were a few RubyConf 2024 Hack Day pull requests
addressing Document and DocumentFragment constructors which were related
to, or blocked on, some changes to the C code.

So this PR is a mega-PR that merges all those PRs and unifies the code
and doc style:

- #3327 
- #3336 
- #3334 
- #3335 

But in addition to those PRs also updates:

- the `XML::DocumentFragment` new/initialize argument handling for CRuby
and JRuby
- `XML::DocumentFragment#initialize` kwargs
- `HTML4::Document.parse` kwargs
- general improvement of documentation

**Have you included adequate test coverage?**

I think so!

**Does this change affect the behavior of either the C or the Java
implementations?**

The XML::DocumentFragment allocator has changed, but both
implementations have been updated.
  • Loading branch information
flavorjones authored Dec 8, 2024
2 parents 7b87461 + ac9fb8a commit 92d2e4b
Show file tree
Hide file tree
Showing 18 changed files with 493 additions and 284 deletions.
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ We've resolved many long-standing bugs in the various schema classes, validation

* The undocumented and unused method `Nokogiri::CSS.parse` is now deprecated and will generate a warning. The AST returned by this method is private and subject to change and removal in future versions of Nokogiri. This method will be removed in a future version of Nokogiri.
* Passing an options hash to `CSS.xpath_for` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri.
* Passing an options hash to `HTML5::DocumentFragment.parse` is now deprecated and will generate a warning. Use keyword arguments instead. This will become an error in a future version of Nokogiri.
* Passing libxml2 encoding IDs to `SAX::ParserContext` methods is now deprecated and will generate a warning. The use of `SAX::Parser::ENCODINGS` is also deprecated. Use `Encoding` objects or encoding names instead.


Expand Down
96 changes: 4 additions & 92 deletions ext/java/nokogiri/XmlDocumentFragment.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,112 +48,24 @@ public class XmlDocumentFragment extends XmlNode
super(ruby, klazz);
}

@JRubyMethod(name = "new", meta = true, required = 1, optional = 3)
@JRubyMethod(name = "native_new", meta = true)
public static IRubyObject
rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args, Block block)
rbNew(ThreadContext context, IRubyObject cls, IRubyObject value)
{
if (args.length < 1) {
throw context.runtime.newArgumentError(args.length, 1);
}

if (!(args[0] instanceof XmlDocument)) {
if (!(value instanceof XmlDocument)) {
throw context.runtime.newArgumentError("first parameter must be a Nokogiri::XML::Document instance");
}

XmlDocument doc = (XmlDocument) args[0];

// make well-formed fragment, ignore invalid namespace, or add appropriate namespace to parse
if (args.length > 1 && args[1] instanceof RubyString) {
final RubyString arg1 = (RubyString) args[1];
if (XmlDocumentFragment.isTag(arg1)) {
args[1] = RubyString.newString(context.runtime, addNamespaceDeclIfNeeded(doc, rubyStringToString(arg1)));
}
}
XmlDocument doc = (XmlDocument) value;

XmlDocumentFragment fragment = (XmlDocumentFragment) NokogiriService.XML_DOCUMENT_FRAGMENT_ALLOCATOR.allocate(
context.runtime, (RubyClass)cls);
fragment.setDocument(context, doc);
fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment());

Helpers.invoke(context, fragment, "initialize", args, block);
return fragment;
}

private static final ByteList TAG_BEG = ByteList.create("<");
private static final ByteList TAG_END = ByteList.create(">");

private static boolean
isTag(final RubyString str)
{
return str.getByteList().startsWith(TAG_BEG) && str.getByteList().endsWith(TAG_END);
}

private static boolean
isNamespaceDefined(String qName, NamedNodeMap nodeMap)
{
if (isNamespace(qName.intern())) { return true; }
for (int i = 0; i < nodeMap.getLength(); i++) {
Attr attr = (Attr)nodeMap.item(i);
if (isNamespace(attr.getNodeName())) {
String localPart = getLocalNameForNamespace(attr.getNodeName(), null);
if (getPrefix(qName).equals(localPart)) {
return true;
}
}
}
return false;
}

private static final Pattern QNAME_RE = Pattern.compile("[^</:>\\s]+:[^</:>=\\s]+");
private static final Pattern START_TAG_RE = Pattern.compile("<[^</>]+>");

private static String
addNamespaceDeclIfNeeded(XmlDocument doc, String tags)
{
if (doc.getDocument() == null) { return tags; }
if (doc.getDocument().getDocumentElement() == null) { return tags; }
Matcher matcher = START_TAG_RE.matcher(tags);
Map<CharSequence, CharSequence> rewriteTable = null;
while (matcher.find()) {
String start_tag = matcher.group();
Matcher matcher2 = QNAME_RE.matcher(start_tag);
while (matcher2.find()) {
String qName = matcher2.group();
NamedNodeMap nodeMap = doc.getDocument().getDocumentElement().getAttributes();
if (isNamespaceDefined(qName, nodeMap)) {
CharSequence namespaceDecl = getNamespaceDecl(getPrefix(qName), nodeMap);
if (namespaceDecl != null) {
if (rewriteTable == null) { rewriteTable = new HashMap<CharSequence, CharSequence>(8, 1); }
StringBuilder str = new StringBuilder(qName.length() + namespaceDecl.length() + 3);
String key = str.append('<').append(qName).append('>').toString();
str.setCharAt(key.length() - 1, ' '); // (last) '>' -> ' '
rewriteTable.put(key, str.append(namespaceDecl).append('>'));
}
}
}
}
if (rewriteTable != null) {
for (Map.Entry<CharSequence, CharSequence> e : rewriteTable.entrySet()) {
tags = tags.replace(e.getKey(), e.getValue());
}
}

return tags;
}

private static CharSequence
getNamespaceDecl(final String prefix, NamedNodeMap nodeMap)
{
for (int i = 0; i < nodeMap.getLength(); i++) {
Attr attr = (Attr) nodeMap.item(i);
if (prefix.equals(attr.getLocalName())) {
return new StringBuilder().
append(attr.getName()).append('=').append('"').append(attr.getValue()).append('"');
}
}
return null;
}

@Override
public void
relink_namespace(ThreadContext context)
Expand Down
10 changes: 8 additions & 2 deletions ext/nokogiri/html4_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ static ID id_to_s;

/*
* call-seq:
* new
* new(uri=nil, external_id=nil) → HTML4::Document
*
* Create a new document
* Create a new empty document with base URI +uri+ and external ID +external_id+.
*/
static VALUE
rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
Expand Down Expand Up @@ -151,6 +151,12 @@ rb_html_document_type(VALUE self)
void
noko_init_html_document(void)
{
/* this is here so that rdoc doesn't ignore this file. */
/*
mNokogiri = rb_define_module("Nokogiri");
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
*/

assert(cNokogiriXmlDocument);
cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);

Expand Down
4 changes: 2 additions & 2 deletions ext/nokogiri/nokogiri.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,8 @@ Init_nokogiri(void)
{
mNokogiri = rb_define_module("Nokogiri");
mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo");
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX");
mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX");
mNokogiriHtml5 = rb_define_module_under(mNokogiri, "HTML5");
mNokogiriXml = rb_define_module_under(mNokogiri, "XML");
mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX");
Expand Down
12 changes: 7 additions & 5 deletions ext/nokogiri/xml_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,8 @@ noko_xml_document_s_read_io(VALUE rb_class,
VALUE rb_encoding,
VALUE rb_options)
{
/* TODO: deprecate this method, parse should be the preferred entry point. then we can make this
private. */
libxmlStructuredErrorHandlerState handler_state;
VALUE rb_errors = rb_ary_new();

Expand Down Expand Up @@ -417,6 +419,8 @@ noko_xml_document_s_read_memory(VALUE rb_class,
VALUE rb_encoding,
VALUE rb_options)
{
/* TODO: deprecate this method, parse should be the preferred entry point. then we can make this
private. */
VALUE rb_errors = rb_ary_new();
xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher);

Expand Down Expand Up @@ -444,9 +448,9 @@ noko_xml_document_s_read_memory(VALUE rb_class,

/*
* call-seq:
* new(version = default)
* new(version = "1.0")
*
* Create a new document with +version+ (defaults to "1.0")
* Create a new empty document declaring XML version +version+.
*/
static VALUE
new (int argc, VALUE *argv, VALUE klass)
Expand Down Expand Up @@ -756,9 +760,7 @@ void
noko_init_xml_document(void)
{
assert(cNokogiriXmlNode);
/*
* Nokogiri::XML::Document wraps an xml document.
*/

cNokogiriXmlDocument = rb_define_class_under(mNokogiriXml, "Document", cNokogiriXmlNode);

rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc);
Expand Down
35 changes: 10 additions & 25 deletions ext/nokogiri/xml_document_fragment.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,18 @@

VALUE cNokogiriXmlDocumentFragment;

/*
* call-seq:
* new(document)
*
* Create a new DocumentFragment element on the +document+
*/
/* :nodoc: */
static VALUE
new (int argc, VALUE *argv, VALUE klass)
noko_xml_document_fragment_s_native_new(VALUE klass, VALUE rb_doc)
{
xmlDocPtr xml_doc;
xmlNodePtr node;
VALUE document;
VALUE rest;
xmlDocPtr c_doc;
xmlNodePtr c_node;
VALUE rb_node;

rb_scan_args(argc, argv, "1*", &document, &rest);

xml_doc = noko_xml_document_unwrap(document);

node = xmlNewDocFragment(xml_doc->doc);

noko_xml_document_pin_node(node);

rb_node = noko_xml_node_wrap(klass, node);
rb_obj_call_init(rb_node, argc, argv);
c_doc = noko_xml_document_unwrap(rb_doc);
c_node = xmlNewDocFragment(c_doc->doc);
noko_xml_document_pin_node(c_node);
rb_node = noko_xml_node_wrap(klass, c_node);

return rb_node;
}
Expand All @@ -35,10 +22,8 @@ void
noko_init_xml_document_fragment(void)
{
assert(cNokogiriXmlNode);
/*
* DocumentFragment represents a DocumentFragment node in an xml document.
*/

cNokogiriXmlDocumentFragment = rb_define_class_under(mNokogiriXml, "DocumentFragment", cNokogiriXmlNode);

rb_define_singleton_method(cNokogiriXmlDocumentFragment, "new", new, -1);
rb_define_singleton_method(cNokogiriXmlDocumentFragment, "native_new", noko_xml_document_fragment_s_native_new, 1);
}
67 changes: 44 additions & 23 deletions lib/nokogiri/html4/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -161,52 +161,73 @@ def xpath_doctype
end

class << self
###
# Parse HTML. +string_or_io+ may be a String, or any object that
# responds to _read_ and _close_ such as an IO, or StringIO.
# +url+ is resource where this document is located. +encoding+ is the
# encoding that should be used when processing the document. +options+
# is a number that sets options in the parser, such as
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
# Nokogiri::XML::ParseOptions.
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
# :call-seq:
# parse(input) { |options| ... } => Nokogiri::HTML4::Document
# parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
#
# Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
#
# [Required Parameters]
# - +input+ (String | IO) The content to be parsed.
#
# [Optional Keyword Arguments]
# - +url:+ (String) The base URI for this document.
#
# - +encoding:+ (String) The name of the encoding that should be used when processing the
# document. When not provided, the encoding will be determined based on the document
# content.
#
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
# behaviors during parsing. See ParseOptions for more information. The default value is
# +ParseOptions::DEFAULT_HTML+.
#
# [Yields]
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
# can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
#
# [Returns] Nokogiri::HTML4::Document
def parse(
input,
url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
url: url_, encoding: encoding_, options: options_
)
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
yield options if block_given?

url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
url ||= input.respond_to?(:path) ? input.path : nil

if string_or_io.respond_to?(:encoding)
unless string_or_io.encoding == Encoding::ASCII_8BIT
encoding ||= string_or_io.encoding.name
if input.respond_to?(:encoding)
unless input.encoding == Encoding::ASCII_8BIT
encoding ||= input.encoding.name
end
end

if string_or_io.respond_to?(:read)
if string_or_io.is_a?(Pathname)
if input.respond_to?(:read)
if input.is_a?(Pathname)
# resolve the Pathname to the file and open it as an IO object, see #2110
string_or_io = string_or_io.expand_path.open
url ||= string_or_io.path
input = input.expand_path.open
url ||= input.path
end

unless encoding
string_or_io = EncodingReader.new(string_or_io)
input = EncodingReader.new(input)
begin
return read_io(string_or_io, url, encoding, options.to_i)
return read_io(input, url, encoding, options.to_i)
rescue EncodingReader::EncodingFound => e
encoding = e.found_encoding
end
end
return read_io(string_or_io, url, encoding, options.to_i)
return read_io(input, url, encoding, options.to_i)
end

# read_memory pukes on empty docs
if string_or_io.nil? || string_or_io.empty?
if input.nil? || input.empty?
return encoding ? new.tap { |i| i.encoding = encoding } : new
end

encoding ||= EncodingReader.detect_encoding(string_or_io)
encoding ||= EncodingReader.detect_encoding(input)

read_memory(string_or_io, url, encoding, options.to_i)
read_memory(input, url, encoding, options.to_i)
end
end
end
Expand Down
Loading

0 comments on commit 92d2e4b

Please sign in to comment.