From 2ffe032d299e2bbe7f0f46ad33b2b370ee0259df Mon Sep 17 00:00:00 2001 From: Dominik Rosiek <58699848+sumo-drosiek@users.noreply.github.com> Date: Fri, 9 Jul 2021 14:26:22 +0200 Subject: [PATCH] feat: trim whitechars for file with multiline (#212) * feat: strip whitechars in multiline helper Signed-off-by: Dominik Rosiek * fix(multiline): use proper advance and strip new line whitespaces from beginning of log Signed-off-by: Dominik Rosiek * test(multiline): add multiple multiline logs case Signed-off-by: Dominik Rosiek --- operator/helper/multiline.go | 31 +++++++++++++++++++++++-------- operator/helper/multiline_test.go | 30 ++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/operator/helper/multiline.go b/operator/helper/multiline.go index a6820f9faea2..7aed4e2a3a89 100644 --- a/operator/helper/multiline.go +++ b/operator/helper/multiline.go @@ -79,7 +79,9 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { if firstLoc == nil { // Flush if no more data is expected if len(data) != 0 && atEOF && flushAtEOF { - return len(data), data, nil + token = trimWhitespaces(data) + advance = len(data) + return } return 0, nil, nil // read more data and try again. 
} @@ -89,7 +91,7 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { if firstMatchStart != 0 { // the beginning of the file does not match the start pattern, so return a token up to the first match so we don't lose data advance = firstMatchStart - token = data[0:firstMatchStart] + token = trimWhitespaces(data[0:firstMatchStart]) return } @@ -100,7 +102,9 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { // Flush if no more data is expected if atEOF && flushAtEOF { - return len(data), data, nil + token = trimWhitespaces(data) + advance = len(data) + return } secondLocOffset := firstMatchEnd + 1 @@ -110,8 +114,8 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { } secondMatchStart := secondLoc[0] + secondLocOffset - advance = secondMatchStart // start scanning at the beginning of the second match - token = data[firstMatchStart:secondMatchStart] // the token begins at the first match, and ends at the beginning of the second match + advance = secondMatchStart // start scanning at the beginning of the second match + token = trimWhitespaces(data[firstMatchStart:secondMatchStart]) // the token begins at the first match, and ends at the beginning of the second match err = nil return } @@ -125,7 +129,9 @@ func NewLineEndSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { if loc == nil { // Flush if no more data is expected if len(data) != 0 && atEOF && flushAtEOF { - return len(data), data, nil + token = trimWhitespaces(data) + advance = len(data) + return } return 0, nil, nil // read more data and try again } @@ -137,7 +143,7 @@ func NewLineEndSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc { } advance = loc[1] - token = data[:loc[1]] + token = trimWhitespaces(data[:loc[1]]) err = nil return } @@ -168,7 +174,9 @@ func NewNewlineSplitFunc(encoding encoding.Encoding, flushAtEOF bool) (bufio.Spl // Flush if no more data is expected if atEOF && 
flushAtEOF { - return len(data), data, nil + token = trimWhitespaces(data) + advance = len(data) + return } // Request more data. @@ -187,3 +195,10 @@ func encodedCarriageReturn(encoding encoding.Encoding) ([]byte, error) { nDst, _, err := encoding.NewEncoder().Transform(out, []byte{'\r'}, true) return out[:nDst], err } + +func trimWhitespaces(data []byte) []byte { + // TrimRight strips all trailing "\r", "\n", "\t" and " " bytes from the end of the log. + // TrimLeft strips only leading "\r" and "\n": when the regex ends with $, the line terminator that + // follows the match is left in the buffer and ends up at the beginning of the next log. + return bytes.TrimLeft(bytes.TrimRight(data, "\r\n\t "), "\r\n") +} diff --git a/operator/helper/multiline_test.go b/operator/helper/multiline_test.go index 637deb86abfa..b83645875c4c 100644 --- a/operator/helper/multiline_test.go +++ b/operator/helper/multiline_test.go @@ -68,8 +68,8 @@ func TestLineStartSplitFunc(t *testing.T) { Pattern: `LOGSTART \d+ `, Raw: []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`), ExpectedTokenized: []string{ - `LOGSTART 123 log1 `, - `LOGSTART 234 log2 `, + `LOGSTART 123 log1`, + `LOGSTART 234 log2`, }, }, { @@ -77,8 +77,8 @@ func TestLineStartSplitFunc(t *testing.T) { Pattern: `^LOGSTART \d+ `, Raw: []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"), ExpectedTokenized: []string{ - "LOGSTART 123 LOGSTART 345 log1\n", - "LOGSTART 234 log2\n", + "LOGSTART 123 LOGSTART 345 log1", + "LOGSTART 234 log2", }, }, { @@ -92,7 +92,7 @@ func TestLineStartSplitFunc(t *testing.T) { Pattern: `LOGSTART \d+ `, Raw: []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`), ExpectedTokenized: []string{ - `part that doesn't match `, + `part that doesn't match`, `LOGSTART 123 part that matches`, }, }, @@ -134,6 +134,15 @@ func TestLineStartSplitFunc(t *testing.T) { ExpectedError: errors.New("bufio.Scanner: token too long"), ExpectedTokenized: []string{}, }, + { + Name: "MultipleMultilineLogs", + Pattern: `^LOGSTART 
\d+`, + Raw: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGPART log1\t \nLOGSTART 17 log2\nLOGPART log2\nanother line\nLOGSTART 43 log5"), + ExpectedTokenized: []string{ + "LOGSTART 12 log1\t \nLOGPART log1\nLOGPART log1", + "LOGSTART 17 log2\nLOGPART log2\nanother line", + }, + }, } for _, tc := range testCases { @@ -190,7 +199,7 @@ func TestLineEndSplitFunc(t *testing.T) { Raw: []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"), ExpectedTokenized: []string{ "log1 LOGEND LOGEND", - "\nlog2 LOGEND", + "log2 LOGEND", }, }, { @@ -242,6 +251,15 @@ func TestLineEndSplitFunc(t *testing.T) { ExpectedTokenized: []string{}, ExpectedError: errors.New("bufio.Scanner: token too long"), }, + { + Name: "MultipleMultilineLogs", + Pattern: `^LOGEND.*$`, + Raw: []byte("LOGSTART 12 log1\t \nLOGPART log1\nLOGEND log1\t \nLOGSTART 17 log2\nLOGPART log2\nLOGEND log2\nLOGSTART 43 log5"), + ExpectedTokenized: []string{ + "LOGSTART 12 log1\t \nLOGPART log1\nLOGEND log1", + "LOGSTART 17 log2\nLOGPART log2\nLOGEND log2", + }, + }, } for _, tc := range testCases {