feat: trim whitechars for file with multiline (#212)

* feat: strip whitechars in multiline helper
  Signed-off-by: Dominik Rosiek <[email protected]>
* fix(multiline): use proper advance and strip new line whitespaces from beginning of log
  Signed-off-by: Dominik Rosiek <[email protected]>
* test(multiline): add multiple multiline logs case
  Signed-off-by: Dominik Rosiek <[email protected]>
djaglowski · Jul 9, 2021 · 2ffe032 · 2ffe032
1 parent 3f1d661
commit 2ffe032
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 14 deletions.
diff --git a/operator/helper/multiline.go b/operator/helper/multiline.go
@@ -79,7 +79,9 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc {
 		if firstLoc == nil {
 			// Flush if no more data is expected
 			if len(data) != 0 && atEOF && flushAtEOF {
-				return len(data), data, nil
+				token = trimWhitespaces(data)
+				advance = len(data)
+				return
 			}
 			return 0, nil, nil // read more data and try again.
 		}
@@ -89,7 +91,7 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc {
 		if firstMatchStart != 0 {
 			// the beginning of the file does not match the start pattern, so return a token up to the first match so we don't lose data
 			advance = firstMatchStart
-			token = data[0:firstMatchStart]
+			token = trimWhitespaces(data[0:firstMatchStart])
 			return
 		}
 
@@ -100,7 +102,9 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc {
 
 		// Flush if no more data is expected
 		if atEOF && flushAtEOF {
-			return len(data), data, nil
+			token = trimWhitespaces(data)
+			advance = len(data)
+			return
 		}
 
 		secondLocOffset := firstMatchEnd + 1
@@ -110,8 +114,8 @@ func NewLineStartSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc {
 		}
 		secondMatchStart := secondLoc[0] + secondLocOffset
 
-		advance = secondMatchStart                     // start scanning at the beginning of the second match
-		token = data[firstMatchStart:secondMatchStart] // the token begins at the first match, and ends at the beginning of the second match
+		advance = secondMatchStart                                      // start scanning at the beginning of the second match
+		token = trimWhitespaces(data[firstMatchStart:secondMatchStart]) // the token begins at the first match, and ends at the beginning of the second match
 		err = nil
 		return
 	}
@@ -125,7 +129,9 @@ func NewLineEndSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc {
 		if loc == nil {
 			// Flush if no more data is expected
 			if len(data) != 0 && atEOF && flushAtEOF {
-				return len(data), data, nil
+				token = trimWhitespaces(data)
+				advance = len(data)
+				return
 			}
 			return 0, nil, nil // read more data and try again
 		}
@@ -137,7 +143,7 @@ func NewLineEndSplitFunc(re *regexp.Regexp, flushAtEOF bool) bufio.SplitFunc {
 		}
 
 		advance = loc[1]
-		token = data[:loc[1]]
+		token = trimWhitespaces(data[:loc[1]])
 		err = nil
 		return
 	}
@@ -168,7 +174,9 @@ func NewNewlineSplitFunc(encoding encoding.Encoding, flushAtEOF bool) (bufio.Spl
 
 		// Flush if no more data is expected
 		if atEOF && flushAtEOF {
-			return len(data), data, nil
+			token = trimWhitespaces(data)
+			advance = len(data)
+			return
 		}
 
 		// Request more data.
@@ -187,3 +195,10 @@ func encodedCarriageReturn(encoding encoding.Encoding) ([]byte, error) {
 	nDst, _, err := encoding.NewEncoder().Transform(out, []byte{'\r'}, true)
 	return out[:nDst], err
 }
+
+func trimWhitespaces(data []byte) []byte {
+	// Trailing "\r", "\n", "\t" and " " are stripped first, then leading "\r" and "\n" only,
+	// so leading tabs and spaces are kept. The leading trim handles line-ending bytes left at
+	// the start of the next token, e.g. when a pattern ending in `$` matches just before them.
+	return bytes.TrimLeft(bytes.TrimRight(data, "\r\n\t "), "\r\n")
+}
diff --git a/operator/helper/multiline_test.go b/operator/helper/multiline_test.go
@@ -68,17 +68,17 @@ func TestLineStartSplitFunc(t *testing.T) {
 			Pattern: `LOGSTART \d+ `,
 			Raw:     []byte(`LOGSTART 123 log1 LOGSTART 234 log2 LOGSTART 345 foo`),
 			ExpectedTokenized: []string{
-				`LOGSTART 123 log1 `,
-				`LOGSTART 234 log2 `,
+				`LOGSTART 123 log1`,
+				`LOGSTART 234 log2`,
 			},
 		},
 		{
 			Name:    "TwoLogsLineStart",
 			Pattern: `^LOGSTART \d+ `,
 			Raw:     []byte("LOGSTART 123 LOGSTART 345 log1\nLOGSTART 234 log2\nLOGSTART 345 foo"),
 			ExpectedTokenized: []string{
-				"LOGSTART 123 LOGSTART 345 log1\n",
-				"LOGSTART 234 log2\n",
+				"LOGSTART 123 LOGSTART 345 log1",
+				"LOGSTART 234 log2",
 			},
 		},
 		{
@@ -92,7 +92,7 @@ func TestLineStartSplitFunc(t *testing.T) {
 			Pattern: `LOGSTART \d+ `,
 			Raw:     []byte(`part that doesn't match LOGSTART 123 part that matchesLOGSTART 123 foo`),
 			ExpectedTokenized: []string{
-				`part that doesn't match `,
+				`part that doesn't match`,
 				`LOGSTART 123 part that matches`,
 			},
 		},
@@ -134,6 +134,15 @@ func TestLineStartSplitFunc(t *testing.T) {
 			ExpectedError:     errors.New("bufio.Scanner: token too long"),
 			ExpectedTokenized: []string{},
 		},
+		{
+			Name:    "MultipleMultilineLogs",
+			Pattern: `^LOGSTART \d+`,
+			Raw:     []byte("LOGSTART 12 log1\t  \nLOGPART log1\nLOGPART log1\t   \nLOGSTART 17 log2\nLOGPART log2\nanother line\nLOGSTART 43 log5"),
+			ExpectedTokenized: []string{
+				"LOGSTART 12 log1\t  \nLOGPART log1\nLOGPART log1",
+				"LOGSTART 17 log2\nLOGPART log2\nanother line",
+			},
+		},
 	}
 
 	for _, tc := range testCases {
@@ -190,7 +199,7 @@ func TestLineEndSplitFunc(t *testing.T) {
 			Raw:     []byte("log1 LOGEND LOGEND\nlog2 LOGEND\n"),
 			ExpectedTokenized: []string{
 				"log1 LOGEND LOGEND",
-				"\nlog2 LOGEND",
+				"log2 LOGEND",
 			},
 		},
 		{
@@ -242,6 +251,15 @@ func TestLineEndSplitFunc(t *testing.T) {
 			ExpectedTokenized: []string{},
 			ExpectedError:     errors.New("bufio.Scanner: token too long"),
 		},
+		{
+			Name:    "MultipleMultilineLogs",
+			Pattern: `^LOGEND.*$`,
+			Raw:     []byte("LOGSTART 12 log1\t  \nLOGPART log1\nLOGEND log1\t   \nLOGSTART 17 log2\nLOGPART log2\nLOGEND log2\nLOGSTART 43 log5"),
+			ExpectedTokenized: []string{
+				"LOGSTART 12 log1\t  \nLOGPART log1\nLOGEND log1",
+				"LOGSTART 17 log2\nLOGPART log2\nLOGEND log2",
+			},
+		},
 	}
 
 	for _, tc := range testCases {