diff --git a/.readme.gotxt b/.readme.gotxt index 7313fb1..d73888c 100644 --- a/.readme.gotxt +++ b/.readme.gotxt @@ -194,6 +194,22 @@ some other tool if you want to process the data further. ChangeLog --------- +### v2.3.0 (unreleased) + +- Update to Unicode 14.0. + +- UTF-16 and JSON are printed as lower-case, just like UTF-8 was. Upper-case is + used only for codepoints (e.g. U+00AC). + +- `uni print` can now print from a UTF-8 byte sequence; for example, to print the € + sign: + + uni p utf8:e282ac + uni p 'utf8:e2 82 ac' + uni p 'utf8:0xe2 0x82 0xac' + + Bytes can optionally be separated by any combination of `0x`, `-`, `_`, or spaces. + ### v2.2.1 (2021-06-15) - You can now use `uni p 0d40` to get U+28 by decimal. diff --git a/README.markdown b/README.markdown index 67d8118..69ab77d 100644 --- a/README.markdown +++ b/README.markdown @@ -395,6 +395,22 @@ some other tool if you want to process the data further. ChangeLog --------- +### v2.3.0 (unreleased) + +- Update to Unicode 14.0. + +- UTF-16 and JSON are printed as lower-case, just like UTF-8 was. Upper-case is + used only for codepoints (e.g. U+00AC). + +- `uni print` can now print from a UTF-8 byte sequence; for example, to print the € + sign: + + uni p utf8:e282ac + uni p 'utf8:e2 82 ac' + uni p 'utf8:0xe2 0x82 0xac' + + Bytes can optionally be separated by any combination of `0x`, `-`, `_`, or spaces. + ### v2.2.1 (2021-06-15) - You can now use `uni p 0d40` to get U+28 by decimal. diff --git a/uni.go b/uni.go index cd47441..5184f17 100644 --- a/uni.go +++ b/uni.go @@ -78,6 +78,13 @@ Commands: Range U+2042..U+2050, 0o101..0x5a Categories and Blocks OtherPunctuation, Po, GeneralPunctuation + UTF-8 UTF-8 byte sequence, optionally + separated by any combination of + '0x', '-', '_', or spaces: + utf8:e282ac + utf8:0xe20x820xac + 'utf8:e2 82 ac' + 'utf8:0xe2 0x82 0xac' all Everything emoji [query] Search emojis. 
@@ -400,12 +407,47 @@ func search(args []string, format string, quiet, raw, asJSON, or bool) error { return nil } +var utfClean = strings.NewReplacer("0x", "", " ", "", "_", "", "-", "") + func print(args []string, format string, quiet, raw, asJSON bool) error { f, err := NewFormat(format, asJSON, !quiet, knownColumns...) if err != nil { return err } for _, a := range args { + a = strings.ToLower(a) + + // UTF-8 + if strings.HasPrefix(a, "utf8:") { + a = a[5:] + + seq := utfClean.Replace(a) + if len(seq)%2 == 1 { + seq = "0" + seq + } + + byt := make([]byte, 0, len(seq)/2) + for i := 0; len(seq) > i; i += 2 { + b, err := strconv.ParseUint(seq[i:i+2], 16, 8) + if err != nil { + return fmt.Errorf("invalid UTF-8 sequence %q: %q is not a hex number", + a, seq[i:i+2]) + } + byt = append(byt, byte(b)) + } + + r, s := utf8.DecodeRune(byt) + if r == utf8.RuneError { + return fmt.Errorf("invalid UTF-8 sequence: %q", a) + } + if s != len(byt) { + return fmt.Errorf("multiple characters in sequence %q", a) + } + + f.Line(toLine(unidata.Codepoints[r], raw)) + continue + } + canon := unidata.CanonicalCategory(a) // Print everything. 
diff --git a/uni_test.go b/uni_test.go index 97ccc8c..856c987 100644 --- a/uni_test.go +++ b/uni_test.go @@ -164,6 +164,14 @@ func TestPrint(t *testing.T) { {[]string{"-q", "p", "all"}, "ASTERISM", 34626, -1}, {[]string{"-q", "-r", "p", "U9"}, "'\t'", 1, -1}, + + // UTF-8 + {[]string{"-q", "p", "utf8:75"}, "'u'", 1, -1}, + {[]string{"-q", "p", "UTF8:75"}, "'u'", 1, -1}, + {[]string{"-q", "p", "utf8:e282ac"}, "'€'", 1, -1}, + {[]string{"-q", "p", "utf8:e2 82 ac"}, "'€'", 1, -1}, + {[]string{"-q", "p", "utf8:0xe20x820xac"}, "'€'", 1, -1}, + {[]string{"-q", "p", "utf8:0xE2 0x82 0xAC"}, "'€'", 1, -1}, } for _, tt := range tests { @@ -177,7 +185,7 @@ func TestPrint(t *testing.T) { }() if int(*exit) != tt.wantExit { - t.Fatalf("wrong exit: %d", *exit) + t.Fatalf("exit %d: %s", *exit, outbuf.String()) } out := outbuf.String() @@ -336,12 +344,12 @@ func TestJSON(t *testing.T) { "digraph": "=e", "hex": "20ac", "html": "€", - "json": "\\u20AC", + "json": "\\u20ac", "keysym": "EuroSign", "name": "EURO SIGN", "plane": "Basic Multilingual Plane", - "utf16be": "20 AC", - "utf16le": "AC 20", + "utf16be": "20 ac", + "utf16le": "ac 20", "utf8": "e2 82 ac", "width": "ambiguous", "xml": "€" diff --git a/unidata/unidata.go b/unidata/unidata.go index 15802a2..70301b9 100644 --- a/unidata/unidata.go +++ b/unidata/unidata.go @@ -178,7 +178,7 @@ func (c Codepoint) UTF16(bigEndian bool) string { p[1], p[0], p[3], p[2] = p[0], p[1], p[2], p[3] } } - return fmt.Sprintf(`% X`, p) + return fmt.Sprintf(`% x`, p) } func (c Codepoint) XMLEntity() string {