Skip to content

Commit

Permalink
Merge pull request #2602 from sparklemotion/flavorjones-fix-reader-no…
Browse files Browse the repository at this point in the history
…de-gc_backport-v1.13.x

fix: XML::Reader XML::Attr garbage collection (backport to v1.13.x)

---

**What problem is this PR intended to solve?**

This is a proposed fix for #2598, see that issue for an extended explanation of the problem.

This PR implements "option 2" from that issue's proposed solutions:

- introduce a new `Reader#attribute_hash` that will return a `Hash<String ⇒ String>` (instead of an `Array<XML::Attr>`)
- deprecate `Reader#attribute_nodes` with a plan to remove it entirely in a future release
- re-implement `Reader#attributes` to use `#attribute_hash` (instead of `#attribute_nodes`)

After this change, only applications calling `Reader#attribute_nodes` directly will be running the unsafe code. These users will see a deprecation warning and may use `#attribute_hash` as a replacement.

I think it's very possible that `Reader#attribute_hash` won't meet the needs of people who are working with namespaced attributes and are using `#attribute_nodes` for this purpose. However, I'm intentionally deferring any attempt to solve that DX problem until someone who needs this functionality asks for it.

**Have you included adequate test coverage?**

I tried and failed to add test coverage to the suite that would reproduce the underlying GC bug.

However, existing test coverage of `Reader#attributes` is sufficient for now.


**Does this change affect the behavior of either the C or the Java implementations?**

This PR modifies both the C and Java implementations to behave the same.

Notably, the Java implementation contains a small bugfix which is that `Reader#namespaces` now returns an empty hash when there are no namespaces (it previously returned `nil`).
  • Loading branch information
flavorjones authored Jul 22, 2022
2 parents 88b4730 + 80e888c commit d6c4a2b
Show file tree
Hide file tree
Showing 10 changed files with 215 additions and 32 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -282,12 +282,12 @@ jobs:
fail-fast: false
matrix:
sys: ["enable", "disable"]
runs-on: macos-10.15
runs-on: macos-12
steps:
- uses: actions/checkout@v2
with:
submodules: true
- uses: vmactions/freebsd-vm@v0.1.5
- uses: vmactions/freebsd-vm@v0.2.0
with:
usesh: true
prepare: pkg install -y ruby devel/ruby-gems pkgconf libxml2 libxslt
Expand Down
73 changes: 73 additions & 0 deletions .github/workflows/downstream.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
name: downstream
concurrency:
group: "${{github.workflow}}-${{github.ref}}"
cancel-in-progress: true
on:
workflow_dispatch:
schedule:
- cron: "0 8 * * 1,3,5" # At 08:00 on Monday, Wednesday, and Friday # https://crontab.guru/#0_8_*_*_1,3,5
push:
branches:
- main
- v*.*.x
tags:
- v*.*.*
pull_request:
types: [opened, synchronize]
branches:
- '*'

jobs:
downstream:
name: downstream-${{matrix.name}}
strategy:
fail-fast: false
matrix:
include:
- url: https://github.com/flavorjones/loofah
name: loofah
command: "bundle exec rake test"
- url: https://github.com/rails/rails-html-sanitizer
name: rails-html-sanitizer
command: "bundle exec rake test"
- url: https://github.com/rgrove/sanitize
name: sanitize
command: "bundle exec rake test"
- url: https://github.com/ebeigarts/signer
name: signer
command: "bundle exec rake spec"
- url: https://github.com/WinRb/Viewpoint
name: viewpoint
command: "bundle exec rspec spec"
- url: https://github.com/rails/rails
name: xmlmini
command: "cd activesupport && bundle exec rake test TESTOPTS=-n/XmlMini/"
- url: https://github.com/pythonicrubyist/creek
name: creek
command: "bundle exec rake spec"
runs-on: ubuntu-latest
container:
image: ghcr.io/sparklemotion/nokogiri-test:mri-3.1
steps:
- uses: actions/checkout@v2
with:
submodules: true
- uses: actions/cache@v2
with:
path: ports
key: ports-ubuntu-${{hashFiles('dependencies.yml', 'patches/**/*.patch')}}
- run: bundle install --local || bundle install
- run: bundle exec rake compile
- run: |
git clone --depth=1 ${{matrix.url}} ${{matrix.name}}
cd ${{matrix.name}}
if grep nokogiri Gemfile ; then
sed -i 's/\(.*nokogiri.*\)/\1, path: ".."/' Gemfile
else
echo "gem 'nokogiri', path: '..'" >> Gemfile
fi
if egrep "add_development_dependency.*\bbundler\b" *gemspec ; then
sed -i 's/.*add_development_dependency.*\bbundler\b.*//' *gemspec
fi
bundle install --local || bundle install
${{matrix.command}}
8 changes: 8 additions & 0 deletions ext/java/nokogiri/XmlReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,17 @@ public class XmlReader extends RubyObject
public IRubyObject
attribute_nodes(ThreadContext context)
{
context.runtime.getWarnings().warn("Reader#attribute_nodes is deprecated and will be removed in a future version of Nokogiri. Please use Reader#attribute_hash instead.");
return currentNode().getAttributesNodes();
}

@JRubyMethod
public IRubyObject
attribute_hash(ThreadContext context)
{
return currentNode().getAttributes(context);
}

@JRubyMethod(name = "attributes?")
public IRubyObject
attributes_p(ThreadContext context)
Expand Down
5 changes: 3 additions & 2 deletions ext/java/nokogiri/internals/ReaderNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,10 @@ public abstract class ReaderNode
getAttributes(ThreadContext context)
{
final Ruby runtime = context.runtime;
if (attributeList == null) { return runtime.getNil(); }
RubyHash hash = RubyHash.newHash(runtime);
if (attributeList == null) { return hash; }
for (int i = 0; i < attributeList.length; i++) {
if (isNamespace(attributeList.names.get(i))) { continue; }
IRubyObject k = stringOrBlank(runtime, attributeList.names.get(i));
IRubyObject v = stringOrBlank(runtime, attributeList.values.get(i));
hash.fastASetCheckString(runtime, k, v); // hash.op_aset(context, k, v)
Expand Down Expand Up @@ -150,8 +151,8 @@ public abstract class ReaderNode
getNamespaces(ThreadContext context)
{
final Ruby runtime = context.runtime;
if (namespaces == null) { return runtime.getNil(); }
RubyHash hash = RubyHash.newHash(runtime);
if (namespaces == null) { return hash; }
for (Map.Entry<String, String> entry : namespaces.entrySet()) {
IRubyObject k = stringOrBlank(runtime, entry.getKey());
IRubyObject v = stringOrBlank(runtime, entry.getValue());
Expand Down
1 change: 1 addition & 0 deletions ext/nokogiri/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,7 @@ def compile
have_func("xmlSchemaSetValidStructuredErrors") # introduced in libxml 2.6.23
have_func("xmlSchemaSetParserStructuredErrors") # introduced in libxml 2.6.23
have_func("rb_gc_location") # introduced in Ruby 2.7
have_func("rb_category_warning") # introduced in Ruby 3.0

have_func("vasprintf")

Expand Down
6 changes: 6 additions & 0 deletions ext/nokogiri/nokogiri.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ NOKOPUBFUN VALUE Nokogiri_wrap_xml_document(VALUE klass,
#define DISCARD_CONST_QUAL(t, v) ((t)(uintptr_t)(v))
#define DISCARD_CONST_QUAL_XMLCHAR(v) DISCARD_CONST_QUAL(xmlChar *, v)

#if HAVE_RB_CATEGORY_WARNING
# define NOKO_WARN_DEPRECATION(message) rb_category_warning(RB_WARN_CATEGORY_DEPRECATED, message)
#else
# define NOKO_WARN_DEPRECATION(message) rb_warning(message)
#endif

void Nokogiri_structured_error_func_save(libxmlStructuredErrorHandlerState *handler_state);
void Nokogiri_structured_error_func_save_and_set(libxmlStructuredErrorHandlerState *handler_state, void *user_data,
xmlStructuredErrorFunc handler);
Expand Down
2 changes: 1 addition & 1 deletion ext/nokogiri/xml_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -1806,7 +1806,7 @@ rb_xml_node_new(int argc, VALUE *argv, VALUE klass)
}
if (!rb_obj_is_kind_of(rb_document_node, cNokogiriXmlDocument)) {
// TODO: deprecate allowing Node
rb_warn("Passing a Node as the second parameter to Node.new is deprecated. Please pass a Document instead, or prefer an alternative constructor like Node#add_child. This will become an error in a future release of Nokogiri.");
NOKO_WARN_DEPRECATION("Passing a Node as the second parameter to Node.new is deprecated. Please pass a Document instead, or prefer an alternative constructor like Node#add_child. This will become an error in a future release of Nokogiri.");
}
Noko_Node_Get_Struct(rb_document_node, xmlNode, c_document_node);

Expand Down
53 changes: 52 additions & 1 deletion ext/nokogiri/xml_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ has_attributes(xmlTextReaderPtr reader)
return (0);
}

// TODO: merge this function into the `namespaces` method implementation
static void
Nokogiri_xml_node_namespaces(xmlNodePtr node, VALUE attr_hash)
{
Expand Down Expand Up @@ -150,7 +151,11 @@ namespaces(VALUE self)
/*
:call-seq: attribute_nodes() → Array<Nokogiri::XML::Attr>
Get the attributes of the current node as an Array of Attr
Get the attributes of the current node as an Array of XML:Attr
⚠ This method is deprecated and unsafe to use. It will be removed in a future version of Nokogiri.
See related: #attribute_hash, #attributes
*/
static VALUE
rb_xml_reader_attribute_nodes(VALUE rb_reader)
Expand All @@ -160,6 +165,10 @@ rb_xml_reader_attribute_nodes(VALUE rb_reader)
VALUE attr_nodes;
int j;

// TODO: deprecated, remove in Nokogiri v1.15, see https://github.com/sparklemotion/nokogiri/issues/2598
// After removal, we can also remove all the "node_has_a_document" special handling from xml_node.c
NOKO_WARN_DEPRECATION("Reader#attribute_nodes is deprecated and will be removed in a future version of Nokogiri. Please use Reader#attribute_hash instead.");

Data_Get_Struct(rb_reader, xmlTextReader, c_reader);

if (! has_attributes(c_reader)) {
Expand All @@ -181,6 +190,47 @@ rb_xml_reader_attribute_nodes(VALUE rb_reader)
return attr_nodes;
}

/*
:call-seq: attribute_hash() → Hash<String ⇒ String>
Get the attributes of the current node as a Hash of names and values.
See related: #attributes and #namespaces
*/
static VALUE
rb_xml_reader_attribute_hash(VALUE rb_reader)
{
VALUE rb_attributes = rb_hash_new();
xmlTextReaderPtr c_reader;
xmlNodePtr c_node;
xmlAttrPtr c_property;

Data_Get_Struct(rb_reader, xmlTextReader, c_reader);

if (!has_attributes(c_reader)) {
return rb_attributes;
}

c_node = xmlTextReaderExpand(c_reader);
c_property = c_node->properties;
while (c_property != NULL) {
VALUE rb_name = NOKOGIRI_STR_NEW2(c_property->name);
VALUE rb_value = Qnil;
xmlChar *c_value = xmlNodeGetContent((xmlNode *)c_property);

if (c_value) {
rb_value = NOKOGIRI_STR_NEW2(c_value);
xmlFree(c_value);
}

rb_hash_aset(rb_attributes, rb_name, rb_value);

c_property = c_property->next;
}

return rb_attributes;
}

/*
* call-seq:
* attribute_at(index)
Expand Down Expand Up @@ -696,6 +746,7 @@ noko_init_xml_reader()
rb_define_method(cNokogiriXmlReader, "attribute_at", attribute_at, 1);
rb_define_method(cNokogiriXmlReader, "attribute_count", attribute_count, 0);
rb_define_method(cNokogiriXmlReader, "attribute_nodes", rb_xml_reader_attribute_nodes, 0);
rb_define_method(cNokogiriXmlReader, "attribute_hash", rb_xml_reader_attribute_hash, 0);
rb_define_method(cNokogiriXmlReader, "attributes?", attributes_eh, 0);
rb_define_method(cNokogiriXmlReader, "base_uri", rb_xml_reader_base_uri, 0);
rb_define_method(cNokogiriXmlReader, "default?", default_eh, 0);
Expand Down
14 changes: 6 additions & 8 deletions lib/nokogiri/xml/reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,14 @@ def initialize(source, url = nil, encoding = nil) # :nodoc:
end
private :initialize

# Get the attributes of the current node as a Hash
# Get the attributes and namespaces of the current node as a Hash.
#
# [Returns] (Hash<String, String>) Attribute names and values
# This is the union of Reader#attribute_hash and Reader#namespaces
#
# [Returns]
# (Hash<String, String>) Attribute names and values, and namespace prefixes and hrefs.
def attributes
attrs_hash = attribute_nodes.each_with_object({}) do |node, hash|
hash[node.name] = node.to_s
end
ns = namespaces
attrs_hash.merge!(ns) if ns
attrs_hash
attribute_hash.merge(namespaces)
end

###
Expand Down
81 changes: 63 additions & 18 deletions test/xml/test_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -228,23 +228,61 @@ def test_attributes?
reader.map(&:attributes?))
end

def test_attributes
reader = Nokogiri::XML::Reader.from_memory(<<-eoxml)
<x xmlns:tenderlove='http://tenderlovemaking.com/'
xmlns='http://mothership.connection.com/'
>
<tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
</x>
eoxml
def test_reader_attributes
reader = Nokogiri::XML::Reader.from_memory(<<~XML)
<x xmlns:tenderlove='http://tenderlovemaking.com/' xmlns='http://mothership.connection.com/'>
<tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
</x>
XML
assert_empty(reader.attributes)
assert_equal([{ "xmlns:tenderlove" => "http://tenderlovemaking.com/",
"xmlns" => "http://mothership.connection.com/", },
{}, { "awesome" => "true" }, {}, { "awesome" => "true" }, {},
{},
{ "awesome" => "true" },
{},
{ "awesome" => "true" },
{},
{ "xmlns:tenderlove" => "http://tenderlovemaking.com/",
"xmlns" => "http://mothership.connection.com/", },],
reader.map(&:attributes))
end

def test_reader_attributes_hash
reader = Nokogiri::XML::Reader.from_memory(<<~XML)
<x xmlns:tenderlove='http://tenderlovemaking.com/' xmlns='http://mothership.connection.com/'>
<tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
</x>
XML
assert_empty(reader.attribute_hash)
assert_equal([{},
{},
{ "awesome" => "true" },
{},
{ "awesome" => "true" },
{},
{},],
reader.map(&:attribute_hash))
end

def test_reader_namespaces
reader = Nokogiri::XML::Reader.from_memory(<<~XML)
<x xmlns:tenderlove='http://tenderlovemaking.com/' xmlns='http://mothership.connection.com/'>
<tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
</x>
XML
assert_empty(reader.namespaces)
assert_equal([{ "xmlns:tenderlove" => "http://tenderlovemaking.com/",
"xmlns" => "http://mothership.connection.com/", },
{},
{},
{},
{},
{},
{ "xmlns:tenderlove" => "http://tenderlovemaking.com/",
"xmlns" => "http://mothership.connection.com/", },],
reader.map(&:namespaces))
end

def test_attribute_roundtrip
reader = Nokogiri::XML::Reader.from_memory(<<-eoxml)
<x xmlns:tenderlove='http://tenderlovemaking.com/'
Expand Down Expand Up @@ -434,7 +472,9 @@ def test_reader_node_attributes_keep_a_reference_to_the_reader

reader = Nokogiri::XML::Reader.from_memory(xml)
reader.each do |element|
attribute_nodes += element.attribute_nodes
assert_output(nil, /Reader#attribute_nodes is deprecated/) do
attribute_nodes += element.attribute_nodes
end
end
end

Expand All @@ -451,7 +491,9 @@ def test_namespaced_attributes
eoxml
attr_ns = []
while reader.read
if reader.node_type == Nokogiri::XML::Node::ELEMENT_NODE
next unless reader.node_type == Nokogiri::XML::Node::ELEMENT_NODE

assert_output(nil, /Reader#attribute_nodes is deprecated/) do
reader.attribute_nodes.each { |attr| attr_ns << (attr.namespace.nil? ? nil : attr.namespace.prefix) }
end
end
Expand Down Expand Up @@ -555,14 +597,17 @@ def test_read_from_memory
end

def test_large_document_smoke_test
# simply run on a large document to verify that there no GC issues
xml = []
xml << "<elements>"
10000.times { |j| xml << "<element id=\"#{j}\"/>" }
xml << "</elements>"
xml = xml.join("\n")
skip_unless_libxml2("valgrind tests should only run with libxml2")

Nokogiri::XML::Reader.from_memory(xml).each(&:attributes)
refute_valgrind_errors do
xml = []
xml << "<elements>"
10000.times { |j| xml << "<element id=\"#{j}\"/>" }
xml << "</elements>"
xml = xml.join("\n")

Nokogiri::XML::Reader.from_memory(xml).each(&:attributes)
end
end

def test_correct_outer_xml_inclusion
Expand Down

0 comments on commit d6c4a2b

Please sign in to comment.