Skip to content
This repository has been archived by the owner on Jul 10, 2024. It is now read-only.

Discard control characters when parsing JSON #165

Merged
merged 6 commits into from
Oct 21, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions learn/json_preprocessor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package learn

import (
"io"
"unicode"
)

type stripControlCharactersReader struct {
wrapped io.Reader
}

func newStripControlCharactersReader(wrapped io.Reader) stripControlCharactersReader {
return stripControlCharactersReader{wrapped: wrapped}
}

// Read up to len(p) bytes, removing any unescaped control characters found.
thatplguy marked this conversation as resolved.
Show resolved Hide resolved
// Removed characters do not count toward the total bytes read. Returns the
// number of bytes read.
func (r stripControlCharactersReader) Read(p []byte) (n int, err error) {
pIdx := 0
buf := make([]byte, len(p))

// Read up to len(p) bytes, then discard any control characters. Continue
// reading (and discarding control characters) until p is full or there are
// no more bytes to read.
for pIdx < len(p) {
remaining := len(p) - pIdx
bufSlice := buf[:remaining]

var bufN int
bufN, err = r.wrapped.Read(bufSlice)

// Copy from buf to p, skipping unescaped control characters.
thatplguy marked this conversation as resolved.
Show resolved Hide resolved
for _, r := range string(bufSlice) {
if unicode.IsControl(r) {
continue
}
thatplguy marked this conversation as resolved.
Show resolved Hide resolved
pIdx += copy(p[pIdx:], string([]rune{r}))
liujed marked this conversation as resolved.
Show resolved Hide resolved
}

// If we hit an error or read fewer bytes than the size of the buffer,
// don't bother trying to read more from the underlying reader.
if err != nil || bufN < remaining {
break
}
liujed marked this conversation as resolved.
Show resolved Hide resolved

// Otherwise, we loop to replace CRs we dropped.
}

return pIdx, err
}
119 changes: 119 additions & 0 deletions learn/json_preprocessor_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package learn

import (
"io"
"strings"
"testing"
"unicode"

"github.com/stretchr/testify/assert"
)

func TestSkipCarriageReturnReader(t *testing.T) {
testCases := []struct {
Name string
Input string
BytesToRead int
DifferentEOFBehavior bool
}{
{
Name: "empty",
Input: "",
BytesToRead: 128,
},
{
Name: "read 0 bytes",
Input: "foo",
BytesToRead: 0,
},
{
Name: "no CR string",
Input: "foo",
BytesToRead: 128,
},
{
Name: "CR",
Input: "foo\rbar",
BytesToRead: 128,
},
{
Name: "CRs",
Input: "foo\rbar\r",
BytesToRead: 128,
},
{
Name: "fill in after removing CRs",
Input: "foo\rbar",
BytesToRead: 4,
},
{
Name: "CRLFs",
Input: "f\r\noo\r\nbar\r\n",
BytesToRead: 128,
},
{
Name: "fill in after removing CRLFs",
Input: "f\r\noo\r\nbar\r\n",
BytesToRead: 4,
},
{
Name: "fill in after removing trailing CRLFs",
Input: "f\r\noo\r\nbar\r\n",
BytesToRead: 7,

// The strip reader returns an EOF where the strings.Reader
// doesn't, because it tries to read more characters after the
// trailing CRLF and fails.
DifferentEOFBehavior: true,
},
}

// For each test case, read bytes from the CR stripper and compare the
// results to reading bytes straight from a strings.Reader and then
// manually removing CRs.
for _, tc := range testCases {
numCRs := countControlCharacters(tc.Input)

// Read bytes from the CR stripper.
buf := make([]byte, tc.BytesToRead)
n, err := newStripControlCharactersReader(strings.NewReader(tc.Input)).Read(buf)

// Read bytes straight from strings.Reader. Read extra bytes to make up
// for manually removing CRs afterwards.
expectedBuf := make([]byte, tc.BytesToRead+numCRs)
expectedN, expectedErr := strings.NewReader(tc.Input).Read(expectedBuf)

// Manually remove CRs from the string read from the strings.Reader.
expectedStr := removeControlCharacters(string(expectedBuf))

// Copy the expected string into a BytesToRead-sized buffer, to
// ensure the sizes of the underlying buffers match.
expectedBuf = make([]byte, tc.BytesToRead)
copy(expectedBuf, expectedStr)

if !(tc.DifferentEOFBehavior && err == io.EOF) {
assert.Equal(t, expectedErr, err, tc.Name+": error")
}
assert.Equal(t, expectedBuf, buf, tc.Name+": string")
assert.Equal(t, expectedN-numCRs, n, tc.Name+": character count")
}
}

func countControlCharacters(s string) (count int) {
for _, r := range s {
if unicode.IsControl(r) {
count += 1
}
}

return count
}

func removeControlCharacters(s string) string {
return strings.Map(func(r rune) rune {
if unicode.IsControl(r) {
return -1
}
return r
}, s)
}
10 changes: 6 additions & 4 deletions learn/parse_http.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"compress/gzip"
"encoding/json"
"fmt"
"github.com/golang/protobuf/proto"
"io"
"io/ioutil"
"mime"
Expand All @@ -16,20 +15,23 @@ import (
"sort"
"strings"

"github.com/golang/protobuf/proto"

"github.com/andybalholm/brotli"
"github.com/google/uuid"
"github.com/pkg/errors"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
"gopkg.in/yaml.v2"

"github.com/akitasoftware/akita-cli/printer"
"github.com/akitasoftware/akita-cli/telemetry"
pb "github.com/akitasoftware/akita-ir/go/api_spec"
"github.com/akitasoftware/akita-libs/akinet"
"github.com/akitasoftware/akita-libs/memview"
"github.com/akitasoftware/akita-libs/spec_util"
"github.com/akitasoftware/akita-libs/spec_util/ir_hash"

"github.com/akitasoftware/akita-cli/printer"
"github.com/akitasoftware/akita-cli/telemetry"
)

var (
Expand Down Expand Up @@ -632,7 +634,7 @@ func parseMethodMeta(req *akinet.HTTPRequest) *pb.MethodMeta {

func parseHTTPBodyJSON(stream io.Reader) (*pb.Data, error) {
var top interface{}
decoder := json.NewDecoder(stream)
decoder := json.NewDecoder(newStripControlCharactersReader(stream))
decoder.UseNumber()

err := decoder.Decode(&top)
Expand Down
24 changes: 22 additions & 2 deletions learn/parse_http_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
as "github.com/akitasoftware/akita-ir/go/api_spec"
"github.com/akitasoftware/akita-libs/akinet"
"github.com/akitasoftware/akita-libs/spec_util"

"github.com/spf13/viper"
)

Expand All @@ -33,6 +32,7 @@ func init() {
var testBodyDict = `
{
"name": "prince",
"name_with_escaped_CRLF": "prince\r\n",
"number_teeth": 9000,
"dog": true,
"canadian_social_insurance_number": "378734493671000",
Expand Down Expand Up @@ -65,6 +65,7 @@ func newTestBodySpec(statusCode int) *as.Data {
func newTestBodySpecContentType(contentType string, statusCode int) *as.Data {
return newTestBodySpecFromStruct(statusCode, as.HTTPBody_JSON, contentType, map[string]*as.Data{
"name": dataFromPrimitive(spec_util.NewPrimitiveString("prince")),
"name_with_escaped_CRLF": dataFromPrimitive(spec_util.NewPrimitiveString("prince\r\n")),
"number_teeth": dataFromPrimitive(spec_util.NewPrimitiveInt64(9000)),
"dog": dataFromPrimitive(spec_util.NewPrimitiveBool(true)),
"canadian_social_insurance_number": dataFromPrimitive(annotateIfSensitiveForTest(true, spec_util.NewPrimitiveString("378734493671000"))),
Expand Down Expand Up @@ -590,6 +591,26 @@ func TestFailingParse(t *testing.T) {
[]*http.Cookie{},
),
},
{
Name: "json error: invalid '\t\r\n' in string literal",
TestContent: newTestHTTPResponse(
200,
[]byte("{\"field\t\r\n\": 3}"),
"application/json",
map[string][]string{},
[]*http.Cookie{},
),
},
{
Name: "json error: invalid '\t\r\r' not in string literal",
TestContent: newTestHTTPResponse(
200,
[]byte("{\"field\": 3\t\r\n}"),
"application/json",
map[string][]string{},
[]*http.Cookie{},
),
},
}
for _, tc := range testCases {
_, err := ParseHTTP(tc.TestContent)
Expand All @@ -600,5 +621,4 @@ func TestFailingParse(t *testing.T) {
}

viper.Set("debug", false)

}