From f8033f868f6585007ac140b3a080fec729621b91 Mon Sep 17 00:00:00 2001
From: zensh
Date: Sat, 28 Jan 2023 21:45:58 +0800
Subject: [PATCH] Fix UTF-8 text diagnostic encoding.

Walk text strings byte by byte so that \t, \n and \r are written as
escape sequences, other non-printable ASCII bytes and invalid UTF-8
bytes are written with writeU16, and valid multi-byte runes are written
with writeU16 as before (as a surrogate pair above U+FFFF).

---
Review notes (not part of the commit message):
- encodeTextString treats utf8.RuneError as an invalid byte only when the
  decoded size is 1. A well-formed U+FFFD (EF BF BD) also decodes to
  utf8.RuneError, but with size 3; without the size check it was written
  as \u00ef and its two trailing bytes were dropped.
- Added a test case covering a well-formed U+FFFD in a text string.
- Line structure and indentation were reconstructed from a
  whitespace-collapsed copy of this patch. The commit id on the first
  line and the "index" lines still refer to the originally submitted
  revision.

 diagnose.go      | 74 ++++++++++++++++++++++++++++----------
 diagnose_test.go | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+), 19 deletions(-)

diff --git a/diagnose.go b/diagnose.go
index aaea4848..6403f985 100644
--- a/diagnose.go
+++ b/diagnose.go
@@ -66,6 +66,7 @@ func Diag(data []byte, opts *DiagOptions) ([]byte, error) {
 	return di.diag()
 }
 
+// diagnoseDecMode uses the loosest decode options, for diagnostic purposes.
 var diagnoseDecMode, _ = DecOptions{
 	MaxNestedLevels: 256,
 	UTF8:            UTF8DecodeInvalid,
@@ -446,43 +447,78 @@ func (di *diagnose) encodeByteString(val []byte) error {
 var utf16SurrSelf = rune(0x10000)
 
-// quote should be either `'` or `"`
-func (di *diagnose) encodeTextString(val string, quote rune) error {
-	if err := di.writeByte(byte(quote)); err != nil {
+// encodeTextString writes val as a quoted string in diagnostic notation.
+// Printable ASCII is copied as-is; \t, \n, \r, backslash and the quote
+// character are backslash-escaped; every other byte or rune is written
+// with writeU16 (as a UTF-16 surrogate pair above U+FFFF).
+// quote should be either `'` or `"`.
+func (di *diagnose) encodeTextString(val string, quote byte) error {
+	if err := di.writeByte(quote); err != nil {
 		return err
 	}
 
-	for _, r := range val {
-		switch {
-		case r == '\t', r == '\n', r == '\r', r == '\\', r == quote:
-			if err := di.writeByte('\\'); err != nil {
-				return err
-			}
-			if err := di.writeByte(byte(r)); err != nil {
-				return err
+	for i := 0; i < len(val); {
+		if b := val[i]; b < utf8.RuneSelf {
+			switch {
+			case b == '\t', b == '\n', b == '\r', b == '\\', b == quote:
+				if err := di.writeByte('\\'); err != nil {
+					return err
+				}
+
+				switch b {
+				case '\t':
+					b = 't'
+				case '\n':
+					b = 'n'
+				case '\r':
+					b = 'r'
+				}
+				if err := di.writeByte(b); err != nil {
+					return err
+				}
+
+			case b >= ' ' && b <= '~':
+				if err := di.writeByte(b); err != nil {
+					return err
+				}
+
+			default:
+				if err := di.writeU16(rune(b)); err != nil {
+					return err
+				}
 			}
 
-		case r >= ' ' && r <= '~':
-			if err := di.writeByte(byte(r)); err != nil {
+			i++
+			continue
+		}
+
+		c, size := utf8.DecodeRuneInString(val[i:])
+		switch {
+		case c == utf8.RuneError && size == 1:
+			// Invalid UTF-8 byte: write the raw byte value. A well-formed
+			// U+FFFD decodes with size 3 and is handled by the next case.
+			if err := di.writeU16(rune(val[i])); err != nil {
 				return err
 			}
 
-		case r < utf16SurrSelf:
-			if err := di.writeU16(r); err != nil {
+		case c < utf16SurrSelf:
+			if err := di.writeU16(c); err != nil {
 				return err
 			}
 
 		default:
-			r1, r2 := utf16.EncodeRune(r)
-			if err := di.writeU16(r1); err != nil {
+			c1, c2 := utf16.EncodeRune(c)
+			if err := di.writeU16(c1); err != nil {
 				return err
 			}
-			if err := di.writeU16(r2); err != nil {
+			if err := di.writeU16(c2); err != nil {
 				return err
 			}
 		}
+
+		i += size
 	}
 
-	return di.writeByte(byte(quote))
+	return di.writeByte(quote)
 }
 
 func (di *diagnose) encodeFloat(ai byte, val uint64) error {
diff --git a/diagnose_test.go b/diagnose_test.go
index d27ec731..770eb5eb 100644
--- a/diagnose_test.go
+++ b/diagnose_test.go
@@ -549,6 +549,100 @@ func TestDiagnoseByteString(t *testing.T) {
 	})
 }
 
+func TestDiagnoseTextString(t *testing.T) {
+	testCases := []struct {
+		title string
+		cbor  []byte
+		diag  string
+		opts  *DiagOptions
+	}{
+		{
+			"valid UTF-8 text in byte string",
+			hexDecode("4d68656c6c6f2c20e4bda0e5a5bd"),
+			`'hello, \u4f60\u597d'`,
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"valid UTF-8 text in text string",
+			hexDecode("6d68656c6c6f2c20e4bda0e5a5bd"),
+			`"hello, \u4f60\u597d"`, // "hello, 你好"
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"invalid UTF-8 text in byte string",
+			hexDecode("4d68656c6c6fffeee4bda0e5a5bd"),
+			`h'68656c6c6fffeee4bda0e5a5bd'`,
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"invalid UTF-8 text in text string",
+			hexDecode("6d68656c6c6fffeee4bda0e5a5bd"),
+			`"hello\u00ff\u00ee\u4f60\u597d"`,
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"valid U+FFFD replacement character in text string",
+			hexDecode("6461efbfbd"),
+			`"a\ufffd"`, // a well-formed U+FFFD is not an invalid byte
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"valid grapheme cluster text in byte string",
+			hexDecode("583448656c6c6f2c2027e29da4efb88fe2808df09f94a5270ae4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
+			`'Hello, \'\u2764\ufe0f\u200d\ud83d\udd25\'\n\u4f60\u597d\uff0c"\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1"'`,
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"valid grapheme cluster text in text string",
+			hexDecode("783448656c6c6f2c2027e29da4efb88fe2808df09f94a5270ae4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
+			`"Hello, '\u2764\ufe0f\u200d\ud83d\udd25'\n\u4f60\u597d\uff0c\"\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1\""`, // "Hello, '❤️‍🔥'\n你好,\"🧑‍🤝‍🧑\""
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"invalid grapheme cluster text in byte string",
+			hexDecode("583448656c6c6feeff27e29da4efb88fe2808df09f94a5270de4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
+			`h'48656c6c6feeff27e29da4efb88fe2808df09f94a5270de4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122'`,
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+		{
+			"invalid grapheme cluster text in text string",
+			hexDecode("783448656c6c6feeff27e29da4efb88fe2808df09f94a5270de4bda0e5a5bdefbc8c22f09fa791e2808df09fa49de2808df09fa79122"),
+			`"Hello\u00ee\u00ff'\u2764\ufe0f\u200d\ud83d\udd25'\r\u4f60\u597d\uff0c\"\ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1\""`,
+			&DiagOptions{
+				ByteStringText: true,
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.title, func(t *testing.T) {
+
+			data, err := Diag(tc.cbor, tc.opts)
+			if err != nil {
+				t.Errorf("Diag(0x%x) returned error %q", tc.cbor, err)
+			} else if string(data) != tc.diag {
+				t.Errorf("Diag(0x%x) returned `%s`, want %s", tc.cbor, string(data), tc.diag)
+			}
+		})
+	}
+}
+
 func TestDiagnoseFloatingPointNumber(t *testing.T) {
 	testCases := []struct {
 		title string