Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

encoding: skip utf8 charset validation in some cases #31061

Merged
merged 4 commits into from
Dec 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ func CheckAndDeriveCollationFromExprs(ctx sessionctx.Context, funcName string, e
}

func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) bool {
enc := charset.FindEncoding(ec.Charset)
enc := charset.FindEncodingTakeUTF8AsNoop(ec.Charset)
for _, arg := range args {
if arg.GetType().Charset == ec.Charset {
continue
Expand Down
15 changes: 13 additions & 2 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ func IsSupportedEncoding(charset string) bool {
return ok
}

// FindEncodingTakeUTF8AsNoop resolves charset through FindEncoding, with one
// exception: when the resolved encoding has type EncodingTpUTF8, the
// EncodingBinImpl encoding is returned in its place, so utf-8 is treated as a
// no-operation encoding. This is used to reduce the overhead of utf-8
// validation in some cases.
func FindEncodingTakeUTF8AsNoop(charset string) Encoding {
	// Any encoding that is not utf-8 is handed back exactly as FindEncoding
	// resolved it.
	if found := FindEncoding(charset); found.Tp() != EncodingTpUTF8 {
		return found
	}
	return EncodingBinImpl
}

// FindEncoding finds the encoding according to charset.
func FindEncoding(charset string) Encoding {
if len(charset) == 0 {
Expand Down Expand Up @@ -104,7 +115,7 @@ const (
)

// CountValidBytes counts the first valid bytes in src that
// can be encode to the current encoding.
// can be encoded to the current encoding.
func CountValidBytes(e Encoding, src []byte) int {
nSrc := 0
e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool {
Expand All @@ -117,7 +128,7 @@ func CountValidBytes(e Encoding, src []byte) int {
}

// CountValidBytesDecode counts the first valid bytes in src that
// can be decode to utf-8.
// can be decoded to utf-8.
func CountValidBytesDecode(e Encoding, src []byte) int {
nSrc := 0
e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool {
Expand Down
6 changes: 0 additions & 6 deletions parser/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,6 @@ func (s *Scanner) AppendWarn(err error) {
}

func (s *Scanner) tryDecodeToUTF8String(sql string) string {
if mysql.IsUTF8Charset(s.encoding.Name()) {
// Skip utf8 encoding because `ToUTF8` validates the whole SQL.
// This can cause failure when the SQL contains BLOB values.
// TODO: Convert charset on every token and use 'binary' encoding to decode token.
return sql
}
utf8Lit, err := s.encoding.Transform(nil, charset.Slice(sql), charset.OpDecodeReplace)
if err != nil {
s.AppendError(err)
Expand Down
4 changes: 2 additions & 2 deletions parser/yy_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ var (
func resetParams(p *Parser) {
p.charset = mysql.DefaultCharset
p.collation = mysql.DefaultCollationName
p.lexer.encoding = charset.EncodingUTF8Impl
p.lexer.encoding = charset.EncodingBinImpl
}

// ParseParam represents the parameter of parsing.
Expand Down Expand Up @@ -436,6 +436,6 @@ type CharsetClient string

// ApplyOn implements ParseParam interface.
func (c CharsetClient) ApplyOn(p *Parser) error {
p.lexer.encoding = charset.FindEncoding(string(c))
p.lexer.encoding = charset.FindEncodingTakeUTF8AsNoop(string(c))
return nil
}
6 changes: 3 additions & 3 deletions server/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ type inputDecoder struct {

func newInputDecoder(chs string) *inputDecoder {
return &inputDecoder{
encoding: charset.FindEncoding(chs),
encoding: charset.FindEncodingTakeUTF8AsNoop(chs),
buffer: nil,
}
}
Expand Down Expand Up @@ -336,7 +336,7 @@ type resultEncoder struct {
func newResultEncoder(chs string) *resultEncoder {
return &resultEncoder{
chsName: chs,
encoding: charset.FindEncoding(chs),
encoding: charset.FindEncodingTakeUTF8AsNoop(chs),
buffer: nil,
isBinary: chs == charset.CharsetBinary,
isNull: len(chs) == 0,
Expand All @@ -353,7 +353,7 @@ func (d *resultEncoder) updateDataEncoding(chsID uint16) {
if err != nil {
logutil.BgLogger().Warn("unknown charset ID", zap.Error(err))
}
d.dataEncoding = charset.FindEncoding(chs)
d.dataEncoding = charset.FindEncodingTakeUTF8AsNoop(chs)
d.dataIsBinary = chsID == mysql.BinaryDefaultCollationID
}

Expand Down