From 70568520ab10be616f1ca7abaf6d1de11842c4ce Mon Sep 17 00:00:00 2001 From: Cole Schlesinger Date: Fri, 21 Oct 2022 12:20:11 -0700 Subject: [PATCH] Replace '\C' with '\\' instead of '\' When preprocessing JSON to remove control characters, replace escaped control characters with a backslash. This ensures the JSON parser will interpret the escaped control character as an escaped backslash, rather than escaping the next character after the control character. --- learn/json_preprocessor.go | 38 +++++++++++++++++----- learn/json_preprocessor_test.go | 57 +++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go index 9b137b10..737fff16 100644 --- a/learn/json_preprocessor.go +++ b/learn/json_preprocessor.go @@ -5,17 +5,23 @@ import ( ) type stripControlCharactersReader struct { - wrapped io.Reader + hasPrecedingBackslash bool + wrapped io.Reader } -func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersReader { - return stripControlCharactersReader{wrapped: wrapped} +func newStripControlCharactersReader(wrapped io.Reader) *stripControlCharactersReader { + return &stripControlCharactersReader{wrapped: wrapped} } // Read up to len(p) bytes, removing any control characters found. -// Removed characters do not count toward the total bytes read. Returns the -// number of bytes read. -func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { +// Removed characters do not count toward the total bytes read. +// +// If the control character is preceded by a backslash, it is replaced with a +// backslash rather than removed, e.g. "\<CR>" becomes "\\". This prevents +// the JSON parser from applying the backslash to escape the next character. +// +// Returns the number of bytes read. 
+func (r *stripControlCharactersReader) Read(p []byte) (n int, err error) { pIdx := 0 buf := make([]byte, len(p)) @@ -31,10 +37,26 @@ func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { // Copy from buf to p, skipping control characters. for _, c := range bufSlice[:bufN] { + toWrite := c + if c <= 0x1f { - continue + if r.hasPrecedingBackslash { + // If the control character is escaped, replace it with a + // backslash. + toWrite = '\\' + } else { + // Otherwise, just remove it. + continue + } + } + + if c == '\\' { + r.hasPrecedingBackslash = !r.hasPrecedingBackslash + } else { + r.hasPrecedingBackslash = false } - p[pIdx] = c + + p[pIdx] = toWrite pIdx += 1 } diff --git a/learn/json_preprocessor_test.go b/learn/json_preprocessor_test.go index c8a60b70..449ed8e0 100644 --- a/learn/json_preprocessor_test.go +++ b/learn/json_preprocessor_test.go @@ -102,10 +102,9 @@ func TestSkipCarriageReturnReader(t *testing.T) { func TestSkipControlCharacterReader_JSON(t *testing.T) { testCases := []struct { - Name string - Input string - Expected interface{} - ExpectedError bool + Name string + Input string + Expected interface{} }{ { Name: "no control characters", @@ -119,11 +118,11 @@ func TestSkipControlCharacterReader_JSON(t *testing.T) { }, { Name: "escaped control char", - Input: "{\"foo\\\r\\\n\": \"bar\"}", + Input: "{\"foo\\\r\": \"bar\"}", // The Go JSON parser doesn't support escaping control characters. // However, if someone were to try, the preprocessor would remove - // the control character but leave the backslash. + // the control character and replace it with a backslash. Expected: map[string]interface{}{`foo\`: "bar"}, }, { @@ -133,25 +132,49 @@ func TestSkipControlCharacterReader_JSON(t *testing.T) { ", "subject": "world" }`, - Expected: map[string]interface{}{"greeting": "hello", "subject": "world"}, - - // After the newline is removed, the JSON parser interprets the backslash - // as escaping the quote. 
- ExpectedError: true, + Expected: map[string]interface{}{"greeting": "hello \\", "subject": "world"}, }, } for _, tc := range testCases { // Parse JSON after removing control strings. var parsed interface{} + var err error + + // Try parsing using a JSON decoder. decoder := json.NewDecoder(newStripControlCharactersReader(strings.NewReader(tc.Input))) - err := decoder.Decode(&parsed) - if tc.ExpectedError { - assert.Error(t, err, tc.Name) - } else { - assert.NoError(t, err, tc.Name) - assert.Equal(t, tc.Expected, parsed, tc.Name) + err = decoder.Decode(&parsed) + assert.NoError(t, err, "["+tc.Name+", decoder] error") + assert.Equal(t, tc.Expected, parsed, "["+tc.Name+", decoder] not equal") + + // Try reading all bytes from the reader, one byte at a time, and then + // parsing. + bytesToRead := 1 + reader := newStripControlCharactersReader(strings.NewReader(tc.Input)) + read := make([]byte, 0, len([]byte(tc.Input))) + buf := make([]byte, bytesToRead) + for { + var n int + n, err = reader.Read(buf) + + read = append(read, buf[:n]...) + + if err != nil { + break + } + + // Clear the buffer. + for i, _ := range buf { + buf[i] = '\x00' + } + } + + // Parse. + if err == io.EOF { + err = json.Unmarshal(read, &parsed) } + assert.NoError(t, err, "["+tc.Name+", byte-by-byte] error") + assert.Equal(t, tc.Expected, parsed, "["+tc.Name+", byte-by-byte] not equal") } }