From 9f2242e6ab5152381fbc705f9ff646129c532fdb Mon Sep 17 00:00:00 2001 From: Yi Duan Date: Wed, 24 Jan 2024 20:29:48 +0800 Subject: [PATCH] fix: always copy once after top get (#577) --- README.md | 3 ++- api.go | 29 +++++++++++++++++++------ ast/api_amd64.go | 24 --------------------- ast/api_compat.go | 33 +++++++++------------------- ast/search.go | 50 +++++++++++++++++++++++++++++++++++++++++++ fuzz/ast_fuzz_test.go | 2 +- 6 files changed, 85 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 5613609da..9527b71fe 100644 --- a/README.md +++ b/README.md @@ -365,7 +365,8 @@ func init() { ``` ### Copy string -When decoding **string values without any escaped characters**, sonic references them from the origin JSON buffer instead of mallocing a new buffer to copy. This helps a lot for CPU performance but may leave the whole JSON buffer in memory as long as the decoded objects are being used. In practice, we found the extra memory introduced by referring JSON buffer is usually 20% ~ 80% of decoded objects. Once an application holds these objects for a long time (for example, cache the decoded objects for reusing), its in-use memory on the server may go up. We provide the option `decoder.CopyString()` for users to choose not to reference the JSON buffer, which may cause a decline in CPU performance to some degree. +When decoding **string values without any escaped characters**, sonic references them from the origin JSON buffer instead of mallocing a new buffer to copy. This helps a lot for CPU performance but may leave the whole JSON buffer in memory as long as the decoded objects are being used. In practice, we found the extra memory introduced by referring JSON buffer is usually 20% ~ 80% of decoded objects. Once an application holds these objects for a long time (for example, cache the decoded objects for reusing), its in-use memory on the server may go up. - `Config.CopyString`/`decoder.CopyString()`: We provide the option for `Decode()` / `Unmarshal()` users to choose not to reference the JSON buffer, which may cause a decline in CPU performance to some degree. +- `GetFromStringNoCopy()`: For memory safty, `sonic.Get()` / `sonic.GetFromString()` now copies return JSON. If users want to get json more quickly and not care about memory usage, you can use `GetFromStringNoCopy()` to return a JSON direclty referenced from source. ### Pass string or []byte? For alignment to `encoding/json`, we provide API to pass `[]byte` as an argument, but the string-to-bytes copy is conducted at the same time considering safety, which may lose performance when the origin JSON is huge. Therefore, you can use `UnmarshalString()` and `GetFromString()` to pass a string, as long as your origin data is a string or **nocopy-cast** is safe for your []byte. We also provide API `MarshalString()` for convenient **nocopy-cast** of encoded JSON []byte, which is safe since sonic's output bytes is always duplicated and unique. diff --git a/api.go b/api.go index ea73fe168..54d9a2160 100644 --- a/api.go +++ b/api.go @@ -20,6 +20,7 @@ import ( `io` `github.com/bytedance/sonic/ast` + `github.com/bytedance/sonic/internal/rt` ) // Config is a combination of sonic/encoder.Options and sonic/decoder.Options @@ -173,27 +174,41 @@ func UnmarshalString(buf string, val interface{}) error { return ConfigDefault.UnmarshalFromString(buf, val) } -// Get searches the given path from json, -// and returns its representing ast.Node. +// Get searches and locates the given path from src json, +// and returns a ast.Node representing the partially json. // // Each path arg must be integer or string: // - Integer is target index(>=0), means searching current node as array. // - String is target key, means searching current node as object. // // -// Note, the api expects the json is well-formed at least, -// otherwise it may return unexpected result. +// Notice: It expects the src json is **Well-formed** and **Immutable** when calling, +// otherwise it may return unexpected result. +// Considering memory safty, the returned JSON is **Copied** from the input func Get(src []byte, path ...interface{}) (ast.Node, error) { - return GetFromString(string(src), path...) + return GetCopyFromString(rt.Mem2Str(src), path...) } -// GetFromString is same with Get except src is string, -// which can reduce unnecessary memory copy. +// GetFromString is same with Get except src is string. +// +// WARNING: The returned JSON is **Referenced** from the input. +// Caching or long-time holding the returned node may cause OOM. +// If your src is big, consider use GetFromStringCopy(). func GetFromString(src string, path ...interface{}) (ast.Node, error) { return ast.NewSearcher(src).GetByPath(path...) } +// GetCopyFromString is same with Get except src is string +func GetCopyFromString(src string, path ...interface{}) (ast.Node, error) { + return ast.NewSearcher(src).GetByPathCopy(path...) +} + // Valid reports whether data is a valid JSON encoding. func Valid(data []byte) bool { return ConfigDefault.Valid(data) } + +// Valid reports whether data is a valid JSON encoding. +func ValidString(data string) bool { + return ConfigDefault.Valid(rt.Str2Mem(data)) +} diff --git a/ast/api_amd64.go b/ast/api_amd64.go index da6738efd..15a22b7ba 100644 --- a/ast/api_amd64.go +++ b/ast/api_amd64.go @@ -131,27 +131,3 @@ func (self *Parser) getByPath(path ...interface{}) (int, types.ParsingError) { } return start, 0 } - -func (self *Searcher) GetByPath(path ...interface{}) (Node, error) { - var err types.ParsingError - var start int - - self.parser.p = 0 - start, err = self.parser.getByPath(path...) - if err != 0 { - // for compatibility with old version - if err == types.ERR_NOT_FOUND { - return Node{}, ErrNotExist - } - if err == types.ERR_UNSUPPORT_TYPE { - panic("path must be either int(>=0) or string") - } - return Node{}, self.parser.syntaxError(err) - } - - t := switchRawType(self.parser.s[start]) - if t == _V_NONE { - return Node{}, self.parser.ExportError(err) - } - return newRawNode(self.parser.s[start:self.parser.p], t), nil -} diff --git a/ast/api_compat.go b/ast/api_compat.go index 4c5f74309..ce7b08a63 100644 --- a/ast/api_compat.go +++ b/ast/api_compat.go @@ -21,7 +21,6 @@ package ast import ( `encoding/base64` `encoding/json` - `fmt` `github.com/bytedance/sonic/internal/native/types` `github.com/bytedance/sonic/internal/rt` @@ -88,37 +87,25 @@ func (self *Node) encodeInterface(buf *[]byte) error { return nil } -func (self *Searcher) GetByPath(path ...interface{}) (Node, error) { - self.parser.p = 0 - +func (self *Parser) getByPath(path ...interface{}) (int, types.ParsingError) { var err types.ParsingError for _, p := range path { if idx, ok := p.(int); ok && idx >= 0 { - if err = self.parser.searchIndex(idx); err != 0 { - return Node{}, self.parser.ExportError(err) + if err = self.searchIndex(idx); err != 0 { + return -1, err } } else if key, ok := p.(string); ok { - if err = self.parser.searchKey(key); err != 0 { - return Node{}, self.parser.ExportError(err) + if err = self.searchKey(key); err != 0 { + return -1, err } } else { panic("path must be either int(>=0) or string") } } - var start = self.parser.p - if start, err = self.parser.skip(); err != 0 { - return Node{}, self.parser.ExportError(err) - } - ns := len(self.parser.s) - if self.parser.p > ns || start >= ns || start>=self.parser.p { - return Node{}, fmt.Errorf("skip %d char out of json boundary", start) - } - - t := switchRawType(self.parser.s[start]) - if t == _V_NONE { - return Node{}, self.parser.ExportError(err) + var start int + if start, err = self.skip(); err != 0 { + return -1, err } - - return newRawNode(self.parser.s[start:self.parser.p], t), nil -} \ No newline at end of file + return start, 0 +} diff --git a/ast/search.go b/ast/search.go index bb6fceaa7..7108e7ea6 100644 --- a/ast/search.go +++ b/ast/search.go @@ -16,6 +16,11 @@ package ast +import ( + `github.com/bytedance/sonic/internal/rt` + `github.com/bytedance/sonic/internal/native/types` +) + type Searcher struct { parser Parser } @@ -28,3 +33,48 @@ func NewSearcher(str string) *Searcher { }, } } + +// GetByPathCopy search in depth from top json and returns a **Copied** json node at the path location +func (self *Searcher) GetByPathCopy(path ...interface{}) (Node, error) { + return self.getByPath(true, path...) +} + +// GetByPathNoCopy search in depth from top json and returns a **Referenced** json node at the path location +// +// WARN: this search directly refer partial json from top json, which has faster speed, +// may consumes more memory. +func (self *Searcher) GetByPath(path ...interface{}) (Node, error) { + return self.getByPath(false, path...) +} + +func (self *Searcher) getByPath(copystring bool, path ...interface{}) (Node, error) { + var err types.ParsingError + var start int + + self.parser.p = 0 + start, err = self.parser.getByPath(path...) + if err != 0 { + // for compatibility with old version + if err == types.ERR_NOT_FOUND { + return Node{}, ErrNotExist + } + if err == types.ERR_UNSUPPORT_TYPE { + panic("path must be either int(>=0) or string") + } + return Node{}, self.parser.syntaxError(err) + } + + t := switchRawType(self.parser.s[start]) + if t == _V_NONE { + return Node{}, self.parser.ExportError(err) + } + + // copy string to reducing memory usage + var raw string + if copystring { + raw = rt.Mem2Str([]byte(self.parser.s[start:self.parser.p])) + } else { + raw = self.parser.s[start:self.parser.p] + } + return newRawNode(raw, t), nil +} diff --git a/fuzz/ast_fuzz_test.go b/fuzz/ast_fuzz_test.go index 1eea64e33..231d5f620 100644 --- a/fuzz/ast_fuzz_test.go +++ b/fuzz/ast_fuzz_test.go @@ -28,7 +28,7 @@ import ( // data is random, check whether is panic func fuzzAst(t *testing.T, data []byte) { - sonic.Get(data) + sonic.GetFromString(string(data)) } func fuzzASTGetFromObject(t *testing.T, data []byte, m map[string]interface{}) {