From 01b950d224eeafd4e4c59a6b55070bbcd643afe7 Mon Sep 17 00:00:00 2001
From: Brian Floersch
Date: Thu, 26 Sep 2024 17:21:32 -0400
Subject: [PATCH] Auto Multiline v2 - Add Legacy Regex Support (#29573)

A custom sample can now be described by a regex instead of a sample log
line. Entries of the deprecated logs_config.auto_multi_line_extra_patterns
setting are converted into regex samples.

Review fixes folded into this patch:
- Wrap the user pattern in a non-capturing group before prepending "^".
  Without the group, a top-level alternation such as "a|b" compiles to
  "^a|b", which anchors only the first alternative.
- Report the regexp compile error when a pattern is rejected.
- Do not invoke ProcessAndContinue a second time only to build an
  assertion message in the new tests.
- Add a test case that covers a top-level alternation.

---
 .../auto_multiline_detection/user_samples.go  | 52 ++++++++++---
 .../user_samples_test.go                      | 80 +++++++++++++++++++
 2 files changed, 121 insertions(+), 11 deletions(-)

diff --git a/pkg/logs/internal/decoder/auto_multiline_detection/user_samples.go b/pkg/logs/internal/decoder/auto_multiline_detection/user_samples.go
index 25a22c209ecc71..0639c6641cc3e4 100644
--- a/pkg/logs/internal/decoder/auto_multiline_detection/user_samples.go
+++ b/pkg/logs/internal/decoder/auto_multiline_detection/user_samples.go
@@ -7,6 +7,8 @@
 package automultilinedetection
 
 import (
+	"regexp"
+
 	"github.com/DataDog/datadog-agent/pkg/config/model"
 	"github.com/DataDog/datadog-agent/pkg/logs/internal/decoder/auto_multiline_detection/tokens"
 	"github.com/DataDog/datadog-agent/pkg/util/log"
@@ -22,6 +24,8 @@ type UserSample struct {
 	// From a user perspective, this is how similar the log has to be to the sample to be considered a match.
 	// Optional - Default value is 0.75.
 	MatchThreshold *float64 `mapstructure:"match_threshold,omitempty"`
+	// Regex is a pattern used to aggregate logs. NOTE that you can use either a sample or a regex, but not both.
+	Regex string `mapstructure:"regex,omitempty"`
 	// Label is the label to apply to the log message if it matches the sample.
 	// Optional - Default value is "start_group".
 	Label *string `mapstructure:"label,omitempty"`
@@ -30,6 +34,7 @@ type UserSample struct {
 	tokens         []tokens.Token
 	matchThreshold float64
 	label          Label
+	compiledRegex  *regexp.Regexp
 }
 
 // UserSamples is a heuristic that represents a collection of user-defined samples for auto multi-line aggreagtion.
@@ -50,21 +55,40 @@ func NewUserSamples(config model.Reader) *UserSamples {
 		}
 	}
 
+	legacyAdditionalPatterns := config.GetStringSlice("logs_config.auto_multi_line_extra_patterns")
+	if len(legacyAdditionalPatterns) > 0 {
+		log.Warn("Found deprecated logs_config.auto_multi_line_extra_patterns converting to logs_config.auto_multi_line_detection_custom_samples")
+		for _, pattern := range legacyAdditionalPatterns {
+			s = append(s, &UserSample{
+				Regex: pattern,
+			})
+		}
+	}
+
 	parsedSamples := make([]*UserSample, 0, len(s))
 	for _, sample := range s {
-		if sample.Sample == "" {
-			log.Warn("Sample was empty, skipping sample")
-			continue
-		}
-		sample.tokens, _ = tokenizer.tokenize([]byte(sample.Sample))
-		if sample.MatchThreshold != nil {
-			if *sample.MatchThreshold <= 0 || *sample.MatchThreshold > 1 {
-				log.Warnf("Invalid match threshold %f, skipping sample", *sample.MatchThreshold)
+		if sample.Sample != "" {
+			sample.tokens, _ = tokenizer.tokenize([]byte(sample.Sample))
+
+			if sample.MatchThreshold != nil {
+				if *sample.MatchThreshold <= 0 || *sample.MatchThreshold > 1 {
+					log.Warnf("Invalid match threshold %f, skipping sample", *sample.MatchThreshold)
+					continue
+				}
+				sample.matchThreshold = *sample.MatchThreshold
+			} else {
+				sample.matchThreshold = defaultMatchThreshold
+			}
+		} else if sample.Regex != "" {
+			compiled, err := regexp.Compile("^(?:" + sample.Regex + ")") // group so "^" anchors every alternative, not only the first
+			if err != nil {
+				log.Warnf("%q is not a valid regular expression - skipping: %v", sample.Regex, err)
 				continue
 			}
-			sample.matchThreshold = *sample.MatchThreshold
+			sample.compiledRegex = compiled
 		} else {
-			sample.matchThreshold = defaultMatchThreshold
+			log.Warn("Sample and regex was empty, skipping")
+			continue
 		}
 
 		if sample.Label != nil {
@@ -100,7 +124,13 @@ func (j *UserSamples) ProcessAndContinue(context *messageContext) bool {
 	}
 
 	for _, sample := range j.samples {
-		if isMatch(sample.tokens, context.tokens, sample.matchThreshold) {
+		if sample.compiledRegex != nil {
+			if sample.compiledRegex.Match(context.rawMessage) {
+				context.label = sample.label
+				context.labelAssignedBy = "user_sample"
+				return false
+			}
+		} else if isMatch(sample.tokens, context.tokens, sample.matchThreshold) {
 			context.label = sample.label
 			context.labelAssignedBy = "user_sample"
 			return false
diff --git a/pkg/logs/internal/decoder/auto_multiline_detection/user_samples_test.go b/pkg/logs/internal/decoder/auto_multiline_detection/user_samples_test.go
index 33f15698d0e7a9..db74b9efe4d554 100644
--- a/pkg/logs/internal/decoder/auto_multiline_detection/user_samples_test.go
+++ b/pkg/logs/internal/decoder/auto_multiline_detection/user_samples_test.go
@@ -180,3 +180,83 @@ logs_config:
 		assert.Equal(t, test.expectedLabel, context.label, "Expected label %v, got %v", test.expectedLabel, context.label)
 	}
 }
+
+func TestUserPatternsRegexProcess(t *testing.T) {
+
+	datadogYaml := `
+logs_config:
+  auto_multi_line_extra_patterns:
+    - "le\\wacy"
+    - "alpha|beta"
+  auto_multi_line_detection_custom_samples:
+    - regex: "(foo|bar)test\\d+"
+`
+
+	mockConfig := mock.NewFromYAML(t, datadogYaml)
+	samples := NewUserSamples(mockConfig)
+	tokenizer := NewTokenizer(60)
+
+	tests := []struct {
+		expectedLabel Label
+		shouldStop    bool
+		input         string
+	}{
+		{aggregate, true, ""},
+		{aggregate, true, "some random log line"},
+		{aggregate, true, "2023-03-28T14:33:53.743350Z App started successfully"},
+		{startGroup, false, "footest123 some other log line"},
+		{startGroup, false, "bartest123 some other log line"},
+		{startGroup, false, "legacy pattern should match me"},
+		{startGroup, false, "beta starts this line"},
+		{aggregate, true, "a line that only contains beta"},
+		{aggregate, true, "!!![$Not_close_enough%] some other log line"},
+	}
+
+	for _, test := range tests {
+		context := &messageContext{
+			rawMessage: []byte(test.input),
+			label:      aggregate,
+		}
+
+		assert.True(t, tokenizer.ProcessAndContinue(context))
+		assert.Equal(t, test.shouldStop, samples.ProcessAndContinue(context), "Unexpected ProcessAndContinue result for input %q", test.input)
+		assert.Equal(t, test.expectedLabel, context.label, "Expected label %v, got %v", test.expectedLabel, context.label)
+	}
+}
+
+func TestUserPatternsProcessRegexCustomSettings(t *testing.T) {
+
+	datadogYaml := `
+logs_config:
+  auto_multi_line_detection_custom_samples:
+    - regex: "(foo|bar)test\\d+"
+      label: no_aggregate
+`
+
+	mockConfig := mock.NewFromYAML(t, datadogYaml)
+	samples := NewUserSamples(mockConfig)
+	tokenizer := NewTokenizer(60)
+
+	tests := []struct {
+		expectedLabel Label
+		shouldStop    bool
+		input         string
+	}{
+		{aggregate, true, ""},
+		{aggregate, true, "some random log line"},
+		{aggregate, true, "2023-03-28T14:33:53.743350Z App started successfully"},
+		{noAggregate, false, "footest123 some other log line"},
+		{noAggregate, false, "bartest123 some other log line"},
+	}
+
+	for _, test := range tests {
+		context := &messageContext{
+			rawMessage: []byte(test.input),
+			label:      aggregate,
+		}
+
+		assert.True(t, tokenizer.ProcessAndContinue(context))
+		assert.Equal(t, test.shouldStop, samples.ProcessAndContinue(context), "Unexpected ProcessAndContinue result for input %q", test.input)
+		assert.Equal(t, test.expectedLabel, context.label, "Expected label %v, got %v", test.expectedLabel, context.label)
+	}
+}