From 0ed1dd0a60f7c5b66fce570a5f0475ee721b1e24 Mon Sep 17 00:00:00 2001 From: Cole Schlesinger Date: Wed, 19 Oct 2022 18:34:54 -0700 Subject: [PATCH 1/6] Discard control characters when parsing JSON The Go JSON parser rejects JSON strings that contain control characters (U+0000 - U+001F). We'd like to be more permissive. Rather than implementing our own parser, this PR adds a preprocessing step that removes unescaped control characters before invoking Go's JSON parser. --- learn/json_preprocessor.go | 54 +++++++++++++++ learn/json_preprocessor_test.go | 119 ++++++++++++++++++++++++++++++++ learn/parse_http.go | 10 +-- learn/parse_http_test.go | 24 ++++++- 4 files changed, 201 insertions(+), 6 deletions(-) create mode 100644 learn/json_preprocessor.go create mode 100644 learn/json_preprocessor_test.go diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go new file mode 100644 index 00000000..eaeb7353 --- /dev/null +++ b/learn/json_preprocessor.go @@ -0,0 +1,54 @@ +package learn + +import ( + "io" + "unicode" +) + +type stripControlCharactersReader struct { + wrapped io.Reader +} + +func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersReader { + return stripControlCharactersReader{wrapped: wrapped} +} + +// Read up to len(p) bytes, removing any unescaped control characters found. +// Removed characters do not count toward the total bytes read. Returns the +// number of bytes read. +func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { + pIdx := 0 + + // Read up to len(p) bytes, then discard any control characters. Continue + // reading (and discarding control characters) until p is full or there are + // no more bytes to read. + for pIdx < len(p) { + remaining := len(p) - pIdx + buf := make([]byte, remaining) + + var bufN int + bufN, err = r.wrapped.Read(buf) + + // Copy from buf to p, skipping unescaped control characters. + hasPrecedingSlash := false + for _, r := range string(buf) { + if unicode.IsControl(r) && !hasPrecedingSlash { + continue + } + if r == '\\' { + hasPrecedingSlash = !hasPrecedingSlash + } + pIdx += copy(p[pIdx:], string([]rune{r})) + } + + // If we hit an error or read fewer bytes than the size of the buffer, + // don't bother trying to read more from the underlying reader. + if err != nil || bufN < remaining { + break + } + + // Otherwise, we loop to replace CRs we dropped. + } + + return pIdx, err +} diff --git a/learn/json_preprocessor_test.go b/learn/json_preprocessor_test.go new file mode 100644 index 00000000..27973cbd --- /dev/null +++ b/learn/json_preprocessor_test.go @@ -0,0 +1,119 @@ +package learn + +import ( + "io" + "strings" + "testing" + "unicode" + + "github.com/stretchr/testify/assert" +) + +func TestSkipCarriageReturnReader(t *testing.T) { + testCases := []struct { + Name string + Input string + BytesToRead int + DifferentEOFBehavior bool + }{ + { + Name: "empty", + Input: "", + BytesToRead: 128, + }, + { + Name: "read 0 bytes", + Input: "foo", + BytesToRead: 0, + }, + { + Name: "no CR string", + Input: "foo", + BytesToRead: 128, + }, + { + Name: "CR", + Input: "foo\rbar", + BytesToRead: 128, + }, + { + Name: "CRs", + Input: "foo\rbar\r", + BytesToRead: 128, + }, + { + Name: "fill in after removing CRs", + Input: "foo\rbar", + BytesToRead: 4, + }, + { + Name: "CRLFs", + Input: "f\r\noo\r\nbar\r\n", + BytesToRead: 128, + }, + { + Name: "fill in after removing CRLFs", + Input: "f\r\noo\r\nbar\r\n", + BytesToRead: 4, + }, + { + Name: "fill in after removing trailing CRLFs", + Input: "f\r\noo\r\nbar\r\n", + BytesToRead: 7, + + // The strip reader returns an EOF where the strings.Reader + // doesn't, because it tries to read more characters after the + // trailing CRLF and fails. + DifferentEOFBehavior: true, + }, + } + + // For each test case, read bytes from the CR stripper and compare the + // results to reading bytes straight from a strings.Reader and then + // manually removing CRs. + for _, tc := range testCases { + numCRs := countControlCharacters(tc.Input) + + // Read bytes from the CR stripper. + buf := make([]byte, tc.BytesToRead) + n, err := newStripControlCharactersReader(strings.NewReader(tc.Input)).Read(buf) + + // Read bytes straight from strings.Reader. Read extra bytes to make up + // for manually removing CRs afterwards. + expectedBuf := make([]byte, tc.BytesToRead+numCRs) + expectedN, expectedErr := strings.NewReader(tc.Input).Read(expectedBuf) + + // Manually remove CRs from the string read from the strings.Reader. + expectedStr := removeControlCharacters(string(expectedBuf)) + + // Copy the expected string into a BytesToRead-sized buffer, to + // ensure the sizes of the underlying buffers match. + expectedBuf = make([]byte, tc.BytesToRead) + copy(expectedBuf, expectedStr) + + if !(tc.DifferentEOFBehavior && err == io.EOF) { + assert.Equal(t, expectedErr, err, tc.Name+": error") + } + assert.Equal(t, expectedBuf, buf, tc.Name+": string") + assert.Equal(t, expectedN-numCRs, n, tc.Name+": character count") + } +} + +func countControlCharacters(s string) (count int) { + for _, r := range s { + if unicode.IsControl(r) { + count += 1 + } + } + + return count +} + +func removeControlCharacters(s string) string { + return strings.Map(func(r rune) rune { + if unicode.IsControl(r) { + return -1 + } + return r + }, s) +} diff --git a/learn/parse_http.go b/learn/parse_http.go index a660fdbd..67085a26 100644 --- a/learn/parse_http.go +++ b/learn/parse_http.go @@ -6,7 +6,6 @@ import ( "compress/gzip" "encoding/json" "fmt" - "github.com/golang/protobuf/proto" "io" "io/ioutil" "mime" @@ -16,6 +15,8 @@ import ( "sort" "strings" + "github.com/golang/protobuf/proto" + "github.com/andybalholm/brotli" "github.com/google/uuid" "github.com/pkg/errors" @@ -23,13 +24,14 @@ import ( "golang.org/x/text/transform" "gopkg.in/yaml.v2" - "github.com/akitasoftware/akita-cli/printer" - "github.com/akitasoftware/akita-cli/telemetry" pb "github.com/akitasoftware/akita-ir/go/api_spec" "github.com/akitasoftware/akita-libs/akinet" "github.com/akitasoftware/akita-libs/memview" "github.com/akitasoftware/akita-libs/spec_util" "github.com/akitasoftware/akita-libs/spec_util/ir_hash" + + "github.com/akitasoftware/akita-cli/printer" + "github.com/akitasoftware/akita-cli/telemetry" ) var ( @@ -632,7 +634,7 @@ func parseMethodMeta(req *akinet.HTTPRequest) *pb.MethodMeta { func parseHTTPBodyJSON(stream io.Reader) (*pb.Data, error) { var top interface{} - decoder := json.NewDecoder(stream) + decoder := json.NewDecoder(newStripControlCharactersReader(stream)) decoder.UseNumber() err := decoder.Decode(&top) diff --git a/learn/parse_http_test.go b/learn/parse_http_test.go index 2ff117a0..3d10f67a 100644 --- a/learn/parse_http_test.go +++ b/learn/parse_http_test.go @@ -10,7 +10,6 @@ import ( as "github.com/akitasoftware/akita-ir/go/api_spec" "github.com/akitasoftware/akita-libs/akinet" "github.com/akitasoftware/akita-libs/spec_util" - "github.com/spf13/viper" ) @@ -33,6 +32,7 @@ func init() { var testBodyDict = ` { "name": "prince", + "name_with_escaped_CRLF": "prince\\r\\n", "number_teeth": 9000, "dog": true, "canadian_social_insurance_number": "378734493671000", @@ -65,6 +65,7 @@ func newTestBodySpec(statusCode int) *as.Data { func newTestBodySpecContentType(contentType string, statusCode int) *as.Data { return newTestBodySpecFromStruct(statusCode, as.HTTPBody_JSON, contentType, map[string]*as.Data{ "name": dataFromPrimitive(spec_util.NewPrimitiveString("prince")), + "name_with_escaped_CRLF": dataFromPrimitive(spec_util.NewPrimitiveString("prince\\r\\n")), "number_teeth": dataFromPrimitive(spec_util.NewPrimitiveInt64(9000)), "dog": dataFromPrimitive(spec_util.NewPrimitiveBool(true)), "canadian_social_insurance_number": dataFromPrimitive(annotateIfSensitiveForTest(true, spec_util.NewPrimitiveString("378734493671000"))), @@ -590,6 +591,26 @@ func TestFailingParse(t *testing.T) { []*http.Cookie{}, ), }, + { + Name: "json error: invalid '\t\r\n' in string literal", + TestContent: newTestHTTPResponse( + 200, + []byte("{\"field\t\r\n\": 3}"), + "application/json", + map[string][]string{}, + []*http.Cookie{}, + ), + }, + { + Name: "json error: invalid '\t\r\r' not in string literal", + TestContent: newTestHTTPResponse( + 200, + []byte("{\"field\": 3\t\r\n}"), + "application/json", + map[string][]string{}, + []*http.Cookie{}, + ), + }, } for _, tc := range testCases { _, err := ParseHTTP(tc.TestContent) @@ -600,5 +621,4 @@ func TestFailingParse(t *testing.T) { } viper.Set("debug", false) - } From fea612b431db1793c20b496fc9c34c66556efafc Mon Sep 17 00:00:00 2001 From: Cole Schlesinger Date: Wed, 19 Oct 2022 19:51:11 -0700 Subject: [PATCH 2/6] Remove logic for checking escaped control chars --- learn/json_preprocessor.go | 6 +----- learn/parse_http_test.go | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go index eaeb7353..f4e15108 100644 --- a/learn/json_preprocessor.go +++ b/learn/json_preprocessor.go @@ -30,14 +30,10 @@ func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { bufN, err = r.wrapped.Read(buf) // Copy from buf to p, skipping unescaped control characters. - hasPrecedingSlash := false for _, r := range string(buf) { - if unicode.IsControl(r) && !hasPrecedingSlash { + if unicode.IsControl(r) { continue } - if r == '\\' { - hasPrecedingSlash = !hasPrecedingSlash - } pIdx += copy(p[pIdx:], string([]rune{r})) } diff --git a/learn/parse_http_test.go b/learn/parse_http_test.go index 3d10f67a..85161225 100644 --- a/learn/parse_http_test.go +++ b/learn/parse_http_test.go @@ -32,7 +32,7 @@ func init() { var testBodyDict = ` { "name": "prince", - "name_with_escaped_CRLF": "prince\\r\\n", + "name_with_escaped_CRLF": "prince\r\n", "number_teeth": 9000, "dog": true, "canadian_social_insurance_number": "378734493671000", @@ -65,7 +65,7 @@ func newTestBodySpec(statusCode int) *as.Data { func newTestBodySpecContentType(contentType string, statusCode int) *as.Data { return newTestBodySpecFromStruct(statusCode, as.HTTPBody_JSON, contentType, map[string]*as.Data{ "name": dataFromPrimitive(spec_util.NewPrimitiveString("prince")), - "name_with_escaped_CRLF": dataFromPrimitive(spec_util.NewPrimitiveString("prince\\r\\n")), + "name_with_escaped_CRLF": dataFromPrimitive(spec_util.NewPrimitiveString("prince\r\n")), "number_teeth": dataFromPrimitive(spec_util.NewPrimitiveInt64(9000)), "dog": dataFromPrimitive(spec_util.NewPrimitiveBool(true)), "canadian_social_insurance_number": dataFromPrimitive(annotateIfSensitiveForTest(true, spec_util.NewPrimitiveString("378734493671000"))), From b3361deaed335f184d152808a5dbb039472298bf Mon Sep 17 00:00:00 2001 From: Cole Schlesinger Date: Wed, 19 Oct 2022 19:55:52 -0700 Subject: [PATCH 3/6] Reuse a fixed buffer --- learn/json_preprocessor.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go index f4e15108..fb57bdf7 100644 --- a/learn/json_preprocessor.go +++ b/learn/json_preprocessor.go @@ -18,19 +18,20 @@ func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersRe // number of bytes read. func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { pIdx := 0 + buf := make([]byte, len(p)) // Read up to len(p) bytes, then discard any control characters. Continue // reading (and discarding control characters) until p is full or there are // no more bytes to read. for pIdx < len(p) { remaining := len(p) - pIdx - buf := make([]byte, remaining) + bufSlice := buf[:remaining] var bufN int - bufN, err = r.wrapped.Read(buf) + bufN, err = r.wrapped.Read(bufSlice) // Copy from buf to p, skipping unescaped control characters. - for _, r := range string(buf) { + for _, r := range string(bufSlice) { if unicode.IsControl(r) { continue } From 451cae0a07f2548db5b0267ff872ef900ef33c88 Mon Sep 17 00:00:00 2001 From: Cole Schlesinger Date: Thu, 20 Oct 2022 09:58:43 -0700 Subject: [PATCH 4/6] Address PR comments and add more unit tests --- learn/json_preprocessor.go | 12 +++---- learn/json_preprocessor_test.go | 56 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go index fb57bdf7..a58dc7f5 100644 --- a/learn/json_preprocessor.go +++ b/learn/json_preprocessor.go @@ -2,7 +2,6 @@ package learn import ( "io" - "unicode" ) type stripControlCharactersReader struct { @@ -13,7 +12,7 @@ func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersRe return stripControlCharactersReader{wrapped: wrapped} } -// Read up to len(p) bytes, removing any unescaped control characters found. +// Read up to len(p) bytes, removing any control characters found. // Removed characters do not count toward the total bytes read. Returns the // number of bytes read. func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { @@ -30,12 +29,13 @@ func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { var bufN int bufN, err = r.wrapped.Read(bufSlice) - // Copy from buf to p, skipping unescaped control characters. - for _, r := range string(bufSlice) { - if unicode.IsControl(r) { + // Copy from buf to p, skipping control characters. + for _, c := range bufSlice { + if c < 0x1f { continue } - pIdx += copy(p[pIdx:], string([]rune{r})) + p[pIdx] = c + pIdx += 1 } // If we hit an error or read fewer bytes than the size of the buffer, diff --git a/learn/json_preprocessor_test.go b/learn/json_preprocessor_test.go index 27973cbd..c8a60b70 100644 --- a/learn/json_preprocessor_test.go +++ b/learn/json_preprocessor_test.go @@ -1,6 +1,7 @@ package learn import ( + "encoding/json" "io" "strings" "testing" @@ -99,6 +100,61 @@ func TestSkipCarriageReturnReader(t *testing.T) { } } +func TestSkipControlCharacterReader_JSON(t *testing.T) { + testCases := []struct { + Name string + Input string + Expected interface{} + ExpectedError bool + }{ + { + Name: "no control characters", + Input: `{"foo": "bar"}`, + Expected: map[string]interface{}{"foo": "bar"}, + }, + { + Name: "remove control characters", + Input: "{\"foo\r\n\": \"bar\"}", + Expected: map[string]interface{}{"foo": "bar"}, + }, + { + Name: "escaped control char", + Input: "{\"foo\\\r\\\n\": \"bar\"}", + + // The Go JSON parser doesn't support escaping control characters. + // However, if someone were to try, the preprocessor would remove + // the control character but leave the backslash. + Expected: map[string]interface{}{`foo\`: "bar"}, + }, + { + Name: "newline in string", + Input: `{ + "greeting": "hello \ +", + "subject": "world" +}`, + Expected: map[string]interface{}{"greeting": "hello", "subject": "world"}, + + // After the newline is removed, the JSON parser interprets the backslash + // as escaping the quote. + ExpectedError: true, + }, + } + + for _, tc := range testCases { + // Parse JSON after removing control strings. + var parsed interface{} + decoder := json.NewDecoder(newStripControlCharactersReader(strings.NewReader(tc.Input))) + err := decoder.Decode(&parsed) + if tc.ExpectedError { + assert.Error(t, err, tc.Name) + } else { + assert.NoError(t, err, tc.Name) + assert.Equal(t, tc.Expected, parsed, tc.Name) + } + } +} + func countControlCharacters(s string) (count int) { for _, r := range s { if unicode.IsControl(r) { From 9a4ebf0c915a4784e51f98cc6d62c8df5427f4a2 Mon Sep 17 00:00:00 2001 From: Cole <63367934+thatplguy@users.noreply.github.com> Date: Thu, 20 Oct 2022 10:23:36 -0700 Subject: [PATCH 5/6] Update learn/json_preprocessor.go Co-authored-by: Jed Liu --- learn/json_preprocessor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go index a58dc7f5..f12444d6 100644 --- a/learn/json_preprocessor.go +++ b/learn/json_preprocessor.go @@ -31,7 +31,7 @@ func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { // Copy from buf to p, skipping control characters. for _, c := range bufSlice { - if c < 0x1f { + if c <= 0x1f { continue } p[pIdx] = c From cbf9b0e94a9404dee2f3e36c0b6e705c4c8ca73b Mon Sep 17 00:00:00 2001 From: Cole <63367934+thatplguy@users.noreply.github.com> Date: Thu, 20 Oct 2022 15:59:37 -0700 Subject: [PATCH 6/6] Update learn/json_preprocessor.go Co-authored-by: Jed Liu --- learn/json_preprocessor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go index f12444d6..9b137b10 100644 --- a/learn/json_preprocessor.go +++ b/learn/json_preprocessor.go @@ -30,7 +30,7 @@ func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { bufN, err = r.wrapped.Read(bufSlice) // Copy from buf to p, skipping control characters. - for _, c := range bufSlice { + for _, c := range bufSlice[:bufN] { if c <= 0x1f { continue }