diff --git a/learn/json_preprocessor.go b/learn/json_preprocessor.go new file mode 100644 index 00000000..9b137b10 --- /dev/null +++ b/learn/json_preprocessor.go @@ -0,0 +1,51 @@ +package learn + +import ( + "io" +) + +type stripControlCharactersReader struct { + wrapped io.Reader +} + +func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersReader { + return stripControlCharactersReader{wrapped: wrapped} +} + +// Read up to len(p) bytes, removing any control characters found. +// Removed characters do not count toward the total bytes read. Returns the +// number of bytes read. +func (r stripControlCharactersReader) Read(p []byte) (n int, err error) { + pIdx := 0 + buf := make([]byte, len(p)) + + // Read up to len(p) bytes, then discard any control characters. Continue + // reading (and discarding control characters) until p is full or there are + // no more bytes to read. + for pIdx < len(p) { + remaining := len(p) - pIdx + bufSlice := buf[:remaining] + + var bufN int + bufN, err = r.wrapped.Read(bufSlice) + + // Copy from buf to p, skipping control characters. + for _, c := range bufSlice[:bufN] { + if c <= 0x1f { + continue + } + p[pIdx] = c + pIdx += 1 + } + + // If we hit an error or read fewer bytes than the size of the buffer, + // don't bother trying to read more from the underlying reader. + if err != nil || bufN < remaining { + break + } + + // Otherwise, we loop to replace CRs we dropped. + } + + return pIdx, err +} diff --git a/learn/json_preprocessor_test.go b/learn/json_preprocessor_test.go new file mode 100644 index 00000000..c8a60b70 --- /dev/null +++ b/learn/json_preprocessor_test.go @@ -0,0 +1,175 @@ +package learn + +import ( + "encoding/json" + "io" + "strings" + "testing" + "unicode" + + "github.com/stretchr/testify/assert" +) + +func TestSkipCarriageReturnReader(t *testing.T) { + testCases := []struct { + Name string + Input string + BytesToRead int + DifferentEOFBehavior bool + }{ + { + Name: "empty", + Input: "", + BytesToRead: 128, + }, + { + Name: "read 0 bytes", + Input: "foo", + BytesToRead: 0, + }, + { + Name: "no CR string", + Input: "foo", + BytesToRead: 128, + }, + { + Name: "CR", + Input: "foo\rbar", + BytesToRead: 128, + }, + { + Name: "CRs", + Input: "foo\rbar\r", + BytesToRead: 128, + }, + { + Name: "fill in after removing CRs", + Input: "foo\rbar", + BytesToRead: 4, + }, + { + Name: "CRLFs", + Input: "f\r\noo\r\nbar\r\n", + BytesToRead: 128, + }, + { + Name: "fill in after removing CRLFs", + Input: "f\r\noo\r\nbar\r\n", + BytesToRead: 4, + }, + { + Name: "fill in after removing trailing CRLFs", + Input: "f\r\noo\r\nbar\r\n", + BytesToRead: 7, + + // The strip reader returns an EOF where the strings.Reader + // doesn't, because it tries to read more characters after the + // trailing CRLF and fails. + DifferentEOFBehavior: true, + }, + } + + // For each test case, read bytes from the CR stripper and compare the + // results to reading bytes straight from a strings.Reader and then + // manually removing CRs. + for _, tc := range testCases { + numCRs := countControlCharacters(tc.Input) + + // Read bytes from the CR stripper. + buf := make([]byte, tc.BytesToRead) + n, err := newStripControlCharactersReader(strings.NewReader(tc.Input)).Read(buf) + + // Read bytes straight from strings.Reader. Read extra bytes to make up + // for manually removing CRs afterwards. + expectedBuf := make([]byte, tc.BytesToRead+numCRs) + expectedN, expectedErr := strings.NewReader(tc.Input).Read(expectedBuf) + + // Manually remove CRs from the string read from the strings.Reader. + expectedStr := removeControlCharacters(string(expectedBuf)) + + // Copy the expected string into a BytesToRead-sized buffer, to + // ensure the sizes of the underlying buffers match. + expectedBuf = make([]byte, tc.BytesToRead) + copy(expectedBuf, expectedStr) + + if !(tc.DifferentEOFBehavior && err == io.EOF) { + assert.Equal(t, expectedErr, err, tc.Name+": error") + } + assert.Equal(t, expectedBuf, buf, tc.Name+": string") + assert.Equal(t, expectedN-numCRs, n, tc.Name+": character count") + } +} + +func TestSkipControlCharacterReader_JSON(t *testing.T) { + testCases := []struct { + Name string + Input string + Expected interface{} + ExpectedError bool + }{ + { + Name: "no control characters", + Input: `{"foo": "bar"}`, + Expected: map[string]interface{}{"foo": "bar"}, + }, + { + Name: "remove control characters", + Input: "{\"foo\r\n\": \"bar\"}", + Expected: map[string]interface{}{"foo": "bar"}, + }, + { + Name: "escaped control char", + Input: "{\"foo\\\r\\\n\": \"bar\"}", + + // The Go JSON parser doesn't support escaping control characters. + // However, if someone were to try, the preprocessor would remove + // the control character but leave the backslash. + Expected: map[string]interface{}{`foo\`: "bar"}, + }, + { + Name: "newline in string", + Input: `{ + "greeting": "hello \ +", + "subject": "world" +}`, + Expected: map[string]interface{}{"greeting": "hello", "subject": "world"}, + + // After the newline is removed, the JSON parser interprets the backslash + // as escaping the quote. + ExpectedError: true, + }, + } + + for _, tc := range testCases { + // Parse JSON after removing control strings. + var parsed interface{} + decoder := json.NewDecoder(newStripControlCharactersReader(strings.NewReader(tc.Input))) + err := decoder.Decode(&parsed) + if tc.ExpectedError { + assert.Error(t, err, tc.Name) + } else { + assert.NoError(t, err, tc.Name) + assert.Equal(t, tc.Expected, parsed, tc.Name) + } + } +} + +func countControlCharacters(s string) (count int) { + for _, r := range s { + if unicode.IsControl(r) { + count += 1 + } + } + + return count +} + +func removeControlCharacters(s string) string { + return strings.Map(func(r rune) rune { + if unicode.IsControl(r) { + return -1 + } + return r + }, s) +} diff --git a/learn/parse_http.go b/learn/parse_http.go index a660fdbd..67085a26 100644 --- a/learn/parse_http.go +++ b/learn/parse_http.go @@ -6,7 +6,6 @@ import ( "compress/gzip" "encoding/json" "fmt" - "github.com/golang/protobuf/proto" "io" "io/ioutil" "mime" @@ -16,6 +15,8 @@ import ( "sort" "strings" + "github.com/golang/protobuf/proto" + "github.com/andybalholm/brotli" "github.com/google/uuid" "github.com/pkg/errors" @@ -23,13 +24,14 @@ import ( "golang.org/x/text/transform" "gopkg.in/yaml.v2" - "github.com/akitasoftware/akita-cli/printer" - "github.com/akitasoftware/akita-cli/telemetry" pb "github.com/akitasoftware/akita-ir/go/api_spec" "github.com/akitasoftware/akita-libs/akinet" "github.com/akitasoftware/akita-libs/memview" "github.com/akitasoftware/akita-libs/spec_util" "github.com/akitasoftware/akita-libs/spec_util/ir_hash" + + "github.com/akitasoftware/akita-cli/printer" + "github.com/akitasoftware/akita-cli/telemetry" ) var ( @@ -632,7 +634,7 @@ func parseMethodMeta(req *akinet.HTTPRequest) *pb.MethodMeta { func parseHTTPBodyJSON(stream io.Reader) (*pb.Data, error) { var top interface{} - decoder := json.NewDecoder(stream) + decoder := json.NewDecoder(newStripControlCharactersReader(stream)) decoder.UseNumber() err := decoder.Decode(&top) diff --git a/learn/parse_http_test.go b/learn/parse_http_test.go index 2ff117a0..85161225 100644 --- a/learn/parse_http_test.go +++ b/learn/parse_http_test.go @@ -10,7 +10,6 @@ import ( as "github.com/akitasoftware/akita-ir/go/api_spec" "github.com/akitasoftware/akita-libs/akinet" "github.com/akitasoftware/akita-libs/spec_util" - "github.com/spf13/viper" ) @@ -33,6 +32,7 @@ func init() { var testBodyDict = ` { "name": "prince", + "name_with_escaped_CRLF": "prince\r\n", "number_teeth": 9000, "dog": true, "canadian_social_insurance_number": "378734493671000", @@ -65,6 +65,7 @@ func newTestBodySpec(statusCode int) *as.Data { func newTestBodySpecContentType(contentType string, statusCode int) *as.Data { return newTestBodySpecFromStruct(statusCode, as.HTTPBody_JSON, contentType, map[string]*as.Data{ "name": dataFromPrimitive(spec_util.NewPrimitiveString("prince")), + "name_with_escaped_CRLF": dataFromPrimitive(spec_util.NewPrimitiveString("prince\r\n")), "number_teeth": dataFromPrimitive(spec_util.NewPrimitiveInt64(9000)), "dog": dataFromPrimitive(spec_util.NewPrimitiveBool(true)), "canadian_social_insurance_number": dataFromPrimitive(annotateIfSensitiveForTest(true, spec_util.NewPrimitiveString("378734493671000"))), @@ -590,6 +591,26 @@ func TestFailingParse(t *testing.T) { []*http.Cookie{}, ), }, + { + Name: "json error: invalid '\t\r\n' in string literal", + TestContent: newTestHTTPResponse( + 200, + []byte("{\"field\t\r\n\": 3}"), + "application/json", + map[string][]string{}, + []*http.Cookie{}, + ), + }, + { + Name: "json error: invalid '\t\r\r' not in string literal", + TestContent: newTestHTTPResponse( + 200, + []byte("{\"field\": 3\t\r\n}"), + "application/json", + map[string][]string{}, + []*http.Cookie{}, + ), + }, } for _, tc := range testCases { _, err := ParseHTTP(tc.TestContent) @@ -600,5 +621,4 @@ func TestFailingParse(t *testing.T) { } viper.Set("debug", false) - }