From 771164daed8a1c6b902c7ca086ba479729b13f72 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 28 Dec 2020 12:59:46 -0500 Subject: [PATCH] fix(cruby): stop clobbering libxml2 error handler on SAX parser init This was leading to loss of error capture on extremely short HTML docs when encoding was not passed by the caller. This call was introduced in d23fe2c (#87) for reasons that are unclear, but we've come a long way with how we manage the global error handlers and so I think we're OK to stop doing this now. --- CHANGELOG.md | 1 + ext/nokogiri/xml_sax_parser.c | 2 -- test/html/test_document_encoding.rb | 11 +++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 203f8d188d..0796e46b13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -112,6 +112,7 @@ See note below about CVE-2020-26247 in the "Changed" subsection entitled "XML::S * [JRuby] XML::Schema XSD validation errors are captured in `XML::Schema#errors`. These errors were previously ignored. * [JRuby] Standardize reading from IO like objects, including StringIO. [[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)] * [JRuby] Comparison of Node to Document with `Node#<=>` now matches CRuby/libxml2 behavior. +* [CRuby] Syntax errors are now correctly captured in `Document#errors` for short HTML documents. Previously the SAX parser used for encoding detection was clobbering libxml2's global error handler. * [CRuby] Fixed installation on AIX with respect to `vasprintf`. [[#1908](https://github.com/sparklemotion/nokogiri/issues/1908)] * [CRuby] On some platforms, avoid symbol name collision with glibc's `canonicalize`. [[#2105](https://github.com/sparklemotion/nokogiri/issues/2105)] * [Windows Visual C++] Fixed compiler warnings and errors. 
[[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)] diff --git a/ext/nokogiri/xml_sax_parser.c b/ext/nokogiri/xml_sax_parser.c index 0c7d45ec30..de8f3813b1 100644 --- a/ext/nokogiri/xml_sax_parser.c +++ b/ext/nokogiri/xml_sax_parser.c @@ -259,8 +259,6 @@ static VALUE allocate(VALUE klass) { xmlSAXHandlerPtr handler = calloc((size_t)1, sizeof(xmlSAXHandler)); - xmlSetStructuredErrorFunc(NULL, NULL); - handler->startDocument = start_document; handler->endDocument = end_document; handler->startElement = start_element; diff --git a/test/html/test_document_encoding.rb b/test/html/test_document_encoding.rb index 49f6747f30..6238ebfc92 100644 --- a/test/html/test_document_encoding.rb +++ b/test/html/test_document_encoding.rb @@ -142,6 +142,17 @@ def binopen(file) assert_equal(evil, ary_from_file) end end + + describe "error handling" do + RAW = "<html><body><div" + + {"read_memory" => RAW, "read_io" => StringIO.new(RAW)}.each do |flavor, input| + it "#{flavor} should handle errors" do + doc = Nokogiri::HTML.parse(input) + assert_operator(doc.errors.length, :>, 0) + end + end + end end end end