Skip to content
This repository has been archived by the owner on Jul 10, 2024. It is now read-only.

Commit

Permalink
Replace '\C' with '\\' instead of '\' (#166)
Browse files Browse the repository at this point in the history
When preprocessing JSON to remove control characters, replace a control
character that is preceded by a backslash with a second backslash instead of
removing it, e.g. "\<CR>" becomes "\\" rather than "\".  This ensures the JSON
parser will interpret the result as an escaped backslash, rather than letting
the leftover backslash escape the character that follows the control character.
  • Loading branch information
thatplguy authored Oct 21, 2022
1 parent 6546a0d commit cf01e45
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 25 deletions.
38 changes: 30 additions & 8 deletions learn/json_preprocessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,23 @@ import (
)

type stripControlCharactersReader struct {
wrapped io.Reader
hasPrecedingBackslash bool
wrapped io.Reader
}

func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersReader {
return stripControlCharactersReader{wrapped: wrapped}
func newStripControlCharactersReader(wrapped io.Reader) *stripControlCharactersReader {
return &stripControlCharactersReader{wrapped: wrapped}
}

// Read up to len(p) bytes, removing any control characters found.
// Removed characters do not count toward the total bytes read. Returns the
// number of bytes read.
func (r stripControlCharactersReader) Read(p []byte) (n int, err error) {
// Removed characters do not count toward the total bytes read.
//
// If the control character is preceded by a backslash, it is replaced with a
// backslash rather than removed, e.g. "\<CR>" becomes "\\". This prevents
// the JSON parser from applying the backslash to escape the next character.
//
// Returns the number of bytes read.
func (r *stripControlCharactersReader) Read(p []byte) (n int, err error) {
pIdx := 0
buf := make([]byte, len(p))

Expand All @@ -31,10 +37,26 @@ func (r stripControlCharactersReader) Read(p []byte) (n int, err error) {

// Copy from buf to p, skipping control characters.
for _, c := range bufSlice[:bufN] {
toWrite := c

if c <= 0x1f {
continue
if r.hasPrecedingBackslash {
// If the control character is escaped, replace it with a
// backslash.
toWrite = '\\'
} else {
// Otherwise, just remove it.
continue
}
}

if c == '\\' {
r.hasPrecedingBackslash = !r.hasPrecedingBackslash
} else {
r.hasPrecedingBackslash = false
}
p[pIdx] = c

p[pIdx] = toWrite
pIdx += 1
}

Expand Down
57 changes: 40 additions & 17 deletions learn/json_preprocessor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,9 @@ func TestSkipCarriageReturnReader(t *testing.T) {

func TestSkipControlCharacterReader_JSON(t *testing.T) {
testCases := []struct {
Name string
Input string
Expected interface{}
ExpectedError bool
Name string
Input string
Expected interface{}
}{
{
Name: "no control characters",
Expand All @@ -119,11 +118,11 @@ func TestSkipControlCharacterReader_JSON(t *testing.T) {
},
{
Name: "escaped control char",
Input: "{\"foo\\\r\\\n\": \"bar\"}",
Input: "{\"foo\\\r\": \"bar\"}",

// The Go JSON parser doesn't support escaping control characters.
// However, if someone were to try, the preprocessor would remove
// the control character but leave the backslash.
// the control character and replace it with a backslash.
Expected: map[string]interface{}{`foo\`: "bar"},
},
{
Expand All @@ -133,25 +132,49 @@ func TestSkipControlCharacterReader_JSON(t *testing.T) {
",
"subject": "world"
}`,
Expected: map[string]interface{}{"greeting": "hello", "subject": "world"},

// After the newline is removed, the JSON parser interprets the backslash
// as escaping the quote.
ExpectedError: true,
Expected: map[string]interface{}{"greeting": "hello \\", "subject": "world"},
},
}

for _, tc := range testCases {
// Parse JSON after removing control strings.
var parsed interface{}
var err error

// Try parsing using a JSON decoder.
decoder := json.NewDecoder(newStripControlCharactersReader(strings.NewReader(tc.Input)))
err := decoder.Decode(&parsed)
if tc.ExpectedError {
assert.Error(t, err, tc.Name)
} else {
assert.NoError(t, err, tc.Name)
assert.Equal(t, tc.Expected, parsed, tc.Name)
err = decoder.Decode(&parsed)
assert.NoError(t, err, "["+tc.Name+", decoder] error")
assert.Equal(t, tc.Expected, parsed, "["+tc.Name+", decoder] not equal")

// Try reading all bytes from the reader, one byte at a time, and then
// parsing.
bytesToRead := 1
reader := newStripControlCharactersReader(strings.NewReader(tc.Input))
read := make([]byte, 0, len([]byte(tc.Input)))
buf := make([]byte, bytesToRead)
for {
var n int
n, err = reader.Read(buf)

read = append(read, buf[:n]...)

if err != nil {
break
}

// Clear the buffer.
for i, _ := range buf {
buf[i] = '\x00'
}
}

// Parse.
if err == io.EOF {
err = json.Unmarshal(read, &parsed)
}
assert.NoError(t, err, "["+tc.Name+", byte-by-byte] error")
assert.Equal(t, tc.Expected, parsed, "["+tc.Name+", byte-by-byte] not equal")
}
}

Expand Down

0 comments on commit cf01e45

Please sign in to comment.