From 863357f742d96d78556b09d96a8312235611e5eb Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Mon, 17 Aug 2015 15:28:03 +0900 Subject: [PATCH 1/7] Add null_value option for LabeledTSVParser. --- lib/fluent/parser.rb | 4 ++++ test/test_parser.rb | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index 30168cf0a6..d0de62bc48 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -367,6 +367,7 @@ def parse(text) class LabeledTSVParser < ValuesParser config_param :delimiter, :string, :default => "\t" config_param :label_delimiter, :string, :default => ":" + config_param :null_value, :string, :default => nil config_param :time_key, :string, :default => "time" def configure(conf) @@ -381,6 +382,9 @@ def parse(text) text.split(delimiter).each do |pair| key, value = pair.split(label_delimiter, 2) @keys.push(key) + if null_value + value = (null_value == value) ? nil : value + end values.push(value) end diff --git a/test/test_parser.rb b/test/test_parser.rb index 378702d184..a90249908a 100644 --- a/test/test_parser.rb +++ b/test/test_parser.rb @@ -684,6 +684,16 @@ def test_parse_with_keep_time_key assert_equal text, record['time'] end end + + def test_parse_with_nil_string + parser = TextParser::LabeledTSVParser.new + parser.configure( + 'null_value'=>'-' + ) + parser.parse("user_id:-") do |time, record| + assert_nil record['user_id'] + end + end end class NoneParserTest < ::Test::Unit::TestCase From 78a7bd6ada25fa716a366d85c4fa2a5e2de64f85 Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Mon, 17 Aug 2015 17:45:52 +0900 Subject: [PATCH 2/7] add null_value_pattern and null_empty_string option for LabeledTSVParser. remove null_value option. --- lib/fluent/parser.rb | 10 +++++++--- test/test_parser.rb | 24 ++++++++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index d0de62bc48..b3a7a6348c 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -367,7 +367,8 @@ def parse(text) class LabeledTSVParser < ValuesParser config_param :delimiter, :string, :default => "\t" config_param :label_delimiter, :string, :default => ":" - config_param :null_value, :string, :default => nil + config_param :null_value_pattern, :string, :default => nil + config_param :null_empty_string, :bool, :default => false config_param :time_key, :string, :default => "time" def configure(conf) @@ -382,8 +383,11 @@ def parse(text) text.split(delimiter).each do |pair| key, value = pair.split(label_delimiter, 2) @keys.push(key) - if null_value - value = (null_value == value) ? nil : value + if null_empty_string + value = (value == '') ? nil : value + end + if null_value_pattern + value = (value =~ /#{null_value_pattern}/) ? nil : value end values.push(value) end diff --git a/test/test_parser.rb b/test/test_parser.rb index a90249908a..835bef763a 100644 --- a/test/test_parser.rb +++ b/test/test_parser.rb @@ -685,13 +685,29 @@ def test_parse_with_keep_time_key end end - def test_parse_with_nil_string + def test_parse_with_null_value_pattern parser = TextParser::LabeledTSVParser.new parser.configure( - 'null_value'=>'-' + 'null_value_pattern'=>'^(-|null|NULL)$' ) - parser.parse("user_id:-") do |time, record| - assert_nil record['user_id'] + parser.parse("a:-\tb:null\tc:NULL\td:\te:--\tf:nuLL") do |time, record| + assert_nil record['a'] + assert_nil record['b'] + assert_nil record['c'] + assert_equal record['d'], '' + assert_equal record['e'], '--' + assert_equal record['f'], 'nuLL' + end + end + + def test_parse_with_null_empty_string + parser = TextParser::LabeledTSVParser.new + parser.configure( + 'null_empty_string'=>true + ) + parser.parse("a:\tb: ") do |time, record| + assert_nil record['a'] + assert_equal record['b'], ' ' end end end From e8841911ba0f8d686b5336c13a9ba53236ccdc23 Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Tue, 18 Aug 2015 20:12:20 +0900 Subject: [PATCH 3/7] add null_value_pattern and null_empty_string option for ValuesParser. move this option LabeledTSVParser to ValuesParser. --- lib/fluent/parser.rb | 27 ++++++++++++------- test/test_parser.rb | 64 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index b3a7a6348c..b3867baf3b 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -295,6 +295,8 @@ class ValuesParser < Parser end config_param :time_key, :string, :default => nil config_param :time_format, :string, :default => nil + config_param :null_value_pattern, :string, :default => nil + config_param :null_empty_string, :bool, :default => false def configure(conf) super @@ -308,11 +310,16 @@ def configure(conf) end @time_parser = TimeParser.new(@time_format) + + if @null_value_pattern + @null_value_pattern = /#{@null_value_pattern}/ + end + @mutex = Mutex.new end def values_map(values) - record = Hash[keys.zip(values)] + record = Hash[keys.zip(values.map { |value| convert_value_to_nil(value) })] if @time_key value = @keep_time_key ? record[@time_key] : record.delete(@time_key) @@ -345,6 +352,16 @@ def convert_field_type!(record) end } end + + def convert_value_to_nil(value) + if @null_empty_string + value = (value == '') ? nil : value + end + if @null_value_pattern + value = (value =~ @null_value_pattern) ? nil : value + end + value + end end class TSVParser < ValuesParser @@ -367,8 +384,6 @@ def parse(text) class LabeledTSVParser < ValuesParser config_param :delimiter, :string, :default => "\t" config_param :label_delimiter, :string, :default => ":" - config_param :null_value_pattern, :string, :default => nil - config_param :null_empty_string, :bool, :default => false config_param :time_key, :string, :default => "time" def configure(conf) @@ -383,12 +398,6 @@ def parse(text) text.split(delimiter).each do |pair| key, value = pair.split(label_delimiter, 2) @keys.push(key) - if null_empty_string - value = (value == '') ? nil : value - end - if null_value_pattern - value = (value =~ /#{null_value_pattern}/) ? nil : value - end values.push(value) end diff --git a/test/test_parser.rb b/test/test_parser.rb index 835bef763a..ead7d095fc 100644 --- a/test/test_parser.rb +++ b/test/test_parser.rb @@ -529,6 +529,38 @@ def test_parse_with_keep_time_key assert_equal text, record['time'] end end + + data('array param' => '["a","b","c","d","e","f"]', 'string param' => 'a,b,c,d,e,f') + def test_parse_with_null_value_pattern + parser = TextParser::TSVParser.new + parser.configure( + 'keys'=>param, + 'time_key'=>'time', + 'null_value_pattern'=>'^(-|null|NULL)$' + ) + parser.parse("-\tnull\tNULL\t\t--\tnuLL") do |time, record| + assert_nil record['a'] + assert_nil record['b'] + assert_nil record['c'] + assert_equal record['d'], '' + assert_equal record['e'], '--' + assert_equal record['f'], 'nuLL' + end + end + + data('array param' => '["a","b"]', 'string param' => 'a,b') + def test_parse_with_null_empty_string + parser = TextParser::TSVParser.new + parser.configure( + 'keys'=>param, + 'time_key'=>'time', + 'null_empty_string'=>true + ) + parser.parse("\t ") do |time, record| + assert_nil record['a'] + assert_equal record['b'], ' ' + end + end end class CSVParserTest < ::Test::Unit::TestCase @@ -586,6 +618,38 @@ def test_parse_with_keep_time_key assert_equal text, record['time'] end end + + data('array param' => '["a","b","c","d","e","f"]', 'string param' => 'a,b,c,d,e,f') + def test_parse_with_null_value_pattern + parser = TextParser::CSVParser.new + parser.configure( + 'keys'=>param, + 'time_key'=>'time', + 'null_value_pattern'=>'^(-|null|NULL)$' + ) + parser.parse("-,null,NULL,,--,nuLL") do |time, record| + assert_nil record['a'] + assert_nil record['b'] + assert_nil record['c'] + assert_equal record['d'], '' + assert_equal record['e'], '--' + assert_equal record['f'], 'nuLL' + end + end + + data('array param' => '["a","b"]', 'string param' => 'a,b') + def test_parse_with_null_empty_string + parser = TextParser::CSVParser.new + parser.configure( + 'keys'=>param, + 'time_key'=>'time', + 'null_empty_string'=>true + ) + parser.parse(", ") do |time, record| + assert_nil record['a'] + assert_equal record['b'], ' ' + end + end end class LabeledTSVParserTest < ::Test::Unit::TestCase From 1227ebef5fcfe9f8196357a5dd9479d032eca231 Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Wed, 19 Aug 2015 15:26:01 +0900 Subject: [PATCH 4/7] fix. use Regexp.new with @null_value_pattern. --- lib/fluent/parser.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index b3867baf3b..d654348ec9 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -312,7 +312,7 @@ def configure(conf) @time_parser = TimeParser.new(@time_format) if @null_value_pattern - @null_value_pattern = /#{@null_value_pattern}/ + @null_value_pattern = Regexp.new(@null_value_pattern) end @mutex = Mutex.new @@ -358,7 +358,7 @@ def convert_value_to_nil(value) value = (value == '') ? nil : value end if @null_value_pattern - value = (value =~ @null_value_pattern) ? nil : value + value = (@null_value_pattern.match(value)) ? nil : value end value end From a4487a981580921342b9b149219a3aa4a53ea7fe Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Wed, 19 Aug 2015 15:32:25 +0900 Subject: [PATCH 5/7] fix. does not process if the nil value. --- lib/fluent/parser.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index d654348ec9..0af4219520 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -354,10 +354,10 @@ def convert_field_type!(record) end def convert_value_to_nil(value) - if @null_empty_string + if value and @null_empty_string value = (value == '') ? nil : value end - if @null_value_pattern + if value and @null_value_pattern value = (@null_value_pattern.match(value)) ? nil : value end value From 032f640c819d0b68bbfe7d7e5cf4e471a209df4e Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Wed, 19 Aug 2015 15:34:41 +0900 Subject: [PATCH 6/7] fix. use String#scrub if include invalid byte sequence. --- lib/fluent/parser.rb | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index 0af4219520..f959298f5e 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -358,10 +358,22 @@ def convert_value_to_nil(value) value = (value == '') ? nil : value end if value and @null_value_pattern - value = (@null_value_pattern.match(value)) ? nil : value + value = match(value) ? nil : value end value end + + def match(string) + begin + return @null_value_pattern.match(string) + rescue ArgumentError => e + raise e unless e.message.index("invalid byte sequence in".freeze).zero? + log.info "invalid byte sequence is replaced in `#{string}`" + string = string.scrub('?') + retry + end + return true + end end class TSVParser < ValuesParser From 0ec1c5316b7166549769fb1fa6dc2e9202251ac2 Mon Sep 17 00:00:00 2001 From: toyama0919 Date: Wed, 19 Aug 2015 18:42:03 +0900 Subject: [PATCH 7/7] fix. create StringUtil, scrub and regexp match to common process. --- lib/fluent/parser.rb | 14 +------------- lib/fluent/plugin/filter_grep.rb | 23 +++++++---------------- lib/fluent/plugin/string_util.rb | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 29 deletions(-) create mode 100644 lib/fluent/plugin/string_util.rb diff --git a/lib/fluent/parser.rb b/lib/fluent/parser.rb index f959298f5e..7f06bca353 100644 --- a/lib/fluent/parser.rb +++ b/lib/fluent/parser.rb @@ -358,22 +358,10 @@ def convert_value_to_nil(value) value = (value == '') ? nil : value end if value and @null_value_pattern - value = match(value) ? nil : value + value = ::Fluent::StringUtil.match_regexp(@null_value_pattern, value) ? nil : value end value end - - def match(string) - begin - return @null_value_pattern.match(string) - rescue ArgumentError => e - raise e unless e.message.index("invalid byte sequence in".freeze).zero? - log.info "invalid byte sequence is replaced in `#{string}`" - string = string.scrub('?') - retry - end - return true - end end class TSVParser < ValuesParser diff --git a/lib/fluent/plugin/filter_grep.rb b/lib/fluent/plugin/filter_grep.rb index 4f938455e3..704d5d7d6f 100644 --- a/lib/fluent/plugin/filter_grep.rb +++ b/lib/fluent/plugin/filter_grep.rb @@ -18,6 +18,11 @@ module Fluent class GrepFilter < Filter Fluent::Plugin.register_filter('grep', self) + def initialize + super + require 'fluent/plugin/string_util' + end + REGEXP_MAX_NUM = 20 (1..REGEXP_MAX_NUM).each {|i| config_param :"regexp#{i}", :string, :default => nil } @@ -54,10 +59,10 @@ def filter(tag, time, record) begin catch(:break_loop) do @regexps.each do |key, regexp| - throw :break_loop unless match(regexp, record[key].to_s) + throw :break_loop unless ::Fluent::StringUtil.match_regexp(regexp, record[key].to_s) end @excludes.each do |key, exclude| - throw :break_loop if match(exclude, record[key].to_s) + throw :break_loop if ::Fluent::StringUtil.match_regexp(exclude, record[key].to_s) end result = record end @@ -67,19 +72,5 @@ def filter(tag, time, record) end result end - - private - - def match(regexp, string) - begin - return regexp.match(string) - rescue ArgumentError => e - raise e unless e.message.index("invalid byte sequence in".freeze).zero? - log.info "invalid byte sequence is replaced in `#{string}`" - string = string.scrub('?') - retry - end - return true - end end end diff --git a/lib/fluent/plugin/string_util.rb b/lib/fluent/plugin/string_util.rb new file mode 100644 index 0000000000..73e419aa50 --- /dev/null +++ b/lib/fluent/plugin/string_util.rb @@ -0,0 +1,32 @@ +# +# Fluentd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +module Fluent + module StringUtil + def match_regexp(regexp, string) + begin + return regexp.match(string) + rescue ArgumentError => e + raise e unless e.message.index("invalid byte sequence in".freeze).zero? + log.info "invalid byte sequence is replaced in `#{string}`" + string = string.scrub('?') + retry + end + return true + end + module_function :match_regexp + end +end