diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ecbcfd..dc17058 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 4.1.0 + - Feat: added parser_options for more control over XML parsing [#68](https://github.com/logstash-plugins/logstash-filter-xml/pull/68) + ## 4.0.7 - Fixed creation of empty arrays when xpath failed [#59](https://github.com/logstash-plugins/logstash-filter-xml/pull/59) diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 290ef46..437eb52 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -34,6 +34,7 @@ This plugin supports the following configuration options plus the <> |<>|No | <> |<>|No | <> |<>|No +| <> |<>|No | <> |<>|No | <> |<>|Yes | <> |<>|No @@ -87,6 +88,17 @@ filter { } } +[id="plugins-{type}s-{plugin}-parser_options"] +===== `parser_options` + + * Value type is <> + * There is no default value for this setting. + +Setting XML parser options allows for more control of the parsing process. +By default the parser is not strict and thus accepts some invalid content. +Currently supported options are: + + - `strict` - forces the parser to fail early instead of accumulating errors when content is not valid xml. [id="plugins-{type}s-{plugin}-remove_namespaces"] ===== `remove_namespaces` diff --git a/lib/logstash/filters/xml.rb b/lib/logstash/filters/xml.rb index f8e168e..6746ddd 100644 --- a/lib/logstash/filters/xml.rb +++ b/lib/logstash/filters/xml.rb @@ -58,6 +58,13 @@ class LogStash::Filters::Xml < LogStash::Filters::Base # config :xpath, :validate => :hash, :default => {} + # Supported XML parsing options are 'strict', 'no_error' and 'no_warning'. + # - strict mode turns on strict parsing rules (non-compliant xml will fail) + # - no_error and no_warning can be used to suppress errors/warnings + config :parse_options, :validate => :string + # NOTE: technically we support more but we purposefully do not document those. + # e.g. setting "strict|recover" will not turn on strict as they're conflicting + # By default the filter will store the whole parsed XML in the destination # field as described above. Setting this to false will prevent that. config :store_xml, :validate => :boolean, :default => true @@ -110,6 +117,7 @@ def register :error => "When the 'store_xml' configuration option is true, 'target' must also be set" ) end + xml_parse_options # validates parse_options => ... end def filter(event) @@ -141,11 +149,13 @@ def filter(event) if @xpath begin - doc = Nokogiri::XML(value, nil, value.encoding.to_s) + doc = Nokogiri::XML::Document.parse(value, nil, value.encoding.to_s, xml_parse_options) rescue => e event.tag(XMLPARSEFAILURE_TAG) @logger.warn("Error parsing xml", :source => @source, :value => value, :exception => e, :backtrace => e.backtrace) return + else + doc.errors.any? && @logger.debug? && @logger.debug("Parsed xml with #{doc.errors.size} errors") end doc.remove_namespaces! if @remove_namespaces @@ -194,4 +204,26 @@ def filter(event) filter_matched(event) if matched @logger.debug? && @logger.debug("Event after xml filter", :event => event) end + + private + + def xml_parse_options + return Nokogiri::XML::ParseOptions::DEFAULT_XML unless @parse_options # (RECOVER | NONET) + @xml_parse_options ||= begin + parse_options = @parse_options.split(/,|\|/).map do |opt| + name = opt.strip.tr('_', '').upcase + if name.empty? + nil + else + begin + Nokogiri::XML::ParseOptions.const_get(name) + rescue NameError + raise LogStash::ConfigurationError, "unsupported parse option: #{opt.inspect}" + end + end + end + parse_options.compact.inject(0, :|) # e.g. NOERROR | NOWARNING + end + end + end diff --git a/logstash-filter-xml.gemspec b/logstash-filter-xml.gemspec index a7f4589..169e3de 100644 --- a/logstash-filter-xml.gemspec +++ b/logstash-filter-xml.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'logstash-filter-xml' - s.version = '4.0.7' + s.version = '4.1.0' s.licenses = ['Apache License (2.0)'] s.summary = "Parses XML into fields" s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program" diff --git a/spec/filters/xml_spec.rb b/spec/filters/xml_spec.rb index fde66cc..70c6735 100644 --- a/spec/filters/xml_spec.rb +++ b/spec/filters/xml_spec.rb @@ -418,4 +418,63 @@ end end end + + describe "parsing invalid xml" do + subject { described_class.new(options) } + let(:options) { ({ 'source' => 'xmldata', 'store_xml' => false }) } + let(:xmldata) { " " } + let(:event) { LogStash::Event.new(data) } + let(:data) { { "xmldata" => xmldata } } + + before { subject.register } + after { subject.close } + + it 'does not fail (by default)' do + subject.filter(event) + expect( event.get("tags") ).to be nil + end + + context 'strict option' do + let(:options) { super.merge({ 'parse_options' => 'strict' }) } + + it 'does fail parsing' do + subject.filter(event) + expect( event.get("tags") ).to_not be nil + expect( event.get("tags") ).to include '_xmlparsefailure' + end + end + end + + describe "parse_options" do + subject { described_class.new(options) } + let(:options) { ({ 'source' => 'xmldata', 'store_xml' => false, 'parse_options' => parse_options }) } + + context 'strict (supported option)' do + let(:parse_options) { 'strict' } + + it 'registers filter' do + subject.register + expect( subject.send(:xml_parse_options) ). + to eql Nokogiri::XML::ParseOptions::STRICT + end + end + + context 'valid' do + let(:parse_options) { 'no_error,NOWARNING' } + + it 'registers filter' do + subject.register + expect( subject.send(:xml_parse_options) ). + to eql Nokogiri::XML::ParseOptions::NOERROR | Nokogiri::XML::ParseOptions::NOWARNING + end + end + + context 'invalid' do + let(:parse_options) { 'strict,invalid0' } + + it 'fails to register' do + expect { subject.register }.to raise_error(LogStash::ConfigurationError, 'unsupported parse option: "invalid0"') + end + end + end end