Allow printing codepoints by UTF-8 byte sequence

Should probably also add UTF-16, but it's annoying with the whole LE/BE variants, no one asked for it, and I'm never going to use it myself. So I can't be bothered adding it right now 😅

Fixes #25
arp242 · Sep 23, 2021 · 23594a3 · 23594a3
1 parent 5615ea1
commit 23594a3
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 5 deletions.
diff --git a/.readme.gotxt b/.readme.gotxt
@@ -194,6 +194,22 @@ some other tool if you want to process the data further.
 ChangeLog
 ---------
 
+### v2.3.0 (unreleased)
+
+- Update to Unicode 14.0.
+
+- UTF-16 and JSON are printed as lower case, just like UTF-8 was. Upper-case is
+  used only for codepoints (e.g. U+00AC).
+
+- `uni print` can now print from a UTF-8 byte sequence; for example, to print
+  the € sign:
+
+      uni p utf8:e282ac
+      uni p 'utf8:e2 82 ac'
+      uni p 'utf8:0xe2 0x82 0xac'
+
+  Bytes can optionally be separated by any combination of `0x`, `-`, `_`, or spaces.
+
 ### v2.2.1 (2021-06-15)
 
 - You can now use `uni p 0d40` to get U+28 by decimal.

diff --git a/README.markdown b/README.markdown
@@ -395,6 +395,22 @@ some other tool if you want to process the data further.
 ChangeLog
 ---------
 
+### v2.3.0 (unreleased)
+
+- Update to Unicode 14.0.
+
+- UTF-16 and JSON are printed as lower case, just like UTF-8 was. Upper-case is
+  used only for codepoints (e.g. U+00AC).
+
+- `uni print` can now print from a UTF-8 byte sequence; for example, to print
+  the € sign:
+
+      uni p utf8:e282ac
+      uni p 'utf8:e2 82 ac'
+      uni p 'utf8:0xe2 0x82 0xac'
+
+  Bytes can optionally be separated by any combination of `0x`, `-`, `_`, or spaces.
+
 ### v2.2.1 (2021-06-15)
 
 - You can now use `uni p 0d40` to get U+28 by decimal.

diff --git a/uni.go b/uni.go
@@ -78,6 +78,13 @@ Commands:
                        Range                  U+2042..U+2050, 0o101..0x5a
                        Categories and Blocks  OtherPunctuation, Po,
                                               GeneralPunctuation
+                       UTF-8                  UTF-8 byte sequence, optionally
+                                              separated by any combination of
+                                              '0x', '-', '_', or spaces:
+                                                utf8:e282ac
+                                                utf8:0xe20x820xac
+                                                'utf8:e2 82 ac'
+                                                'utf8:0xe2 0x82 0xac'
                        all                    Everything
 
     emoji [query]    Search emojis.
@@ -400,12 +407,47 @@ func search(args []string, format string, quiet, raw, asJSON, or bool) error {
 	return nil
 }
 
+var utfClean = strings.NewReplacer("0x", "", " ", "", "_", "", "-", "")
+
 func print(args []string, format string, quiet, raw, asJSON bool) error {
 	f, err := NewFormat(format, asJSON, !quiet, knownColumns...)
 	if err != nil {
 		return err
 	}
 	for _, a := range args {
+		a = strings.ToLower(a)
+
+		// UTF-8
+		if strings.HasPrefix(a, "utf8:") {
+			a = a[5:]
+
+			seq := utfClean.Replace(a)
+			if len(seq)%2 == 1 {
+				seq = "0" + seq
+			}
+
+			byt := make([]byte, 0, len(seq)/2)
+			for i := 0; len(seq) > i; i += 2 {
+				b, err := strconv.ParseUint(seq[i:i+2], 16, 8)
+				if err != nil {
+					return fmt.Errorf("invalid UTF-8 sequence %q: %q is not a hex number",
+						a, seq[i:i+2])
+				}
+				byt = append(byt, byte(b))
+			}
+
+			r, s := utf8.DecodeRune(byt)
+			if r == utf8.RuneError {
+				return fmt.Errorf("invalid UTF-8 sequence: %q", a)
+			}
+			if s != len(byt) {
+				return fmt.Errorf("multiple characters in sequence %q", a)
+			}
+
+			f.Line(toLine(unidata.Codepoints[r], raw))
+			continue
+		}
+
 		canon := unidata.CanonicalCategory(a)
 
 		// Print everything.

diff --git a/uni_test.go b/uni_test.go
@@ -164,6 +164,14 @@ func TestPrint(t *testing.T) {
 		{[]string{"-q", "p", "all"}, "ASTERISM", 34626, -1},
 
 		{[]string{"-q", "-r", "p", "U9"}, "'\t'", 1, -1},
+
+		// UTF-8
+		{[]string{"-q", "p", "utf8:75"}, "'u'", 1, -1},
+		{[]string{"-q", "p", "UTF8:75"}, "'u'", 1, -1},
+		{[]string{"-q", "p", "utf8:e282ac"}, "'€'", 1, -1},
+		{[]string{"-q", "p", "utf8:e2 82 ac"}, "'€'", 1, -1},
+		{[]string{"-q", "p", "utf8:0xe20x820xac"}, "'€'", 1, -1},
+		{[]string{"-q", "p", "utf8:0xE2 0x82 0xAC"}, "'€'", 1, -1},
 	}
 
 	for _, tt := range tests {
@@ -177,7 +185,7 @@ func TestPrint(t *testing.T) {
 			}()
 
 			if int(*exit) != tt.wantExit {
-				t.Fatalf("wrong exit: %d", *exit)
+				t.Fatalf("exit %d: %s", *exit, outbuf.String())
 			}
 
 			out := outbuf.String()
@@ -336,12 +344,12 @@ func TestJSON(t *testing.T) {
 	"digraph": "=e",
 	"hex": "20ac",
 	"html": "&euro;",
-	"json": "\\u20AC",
+	"json": "\\u20ac",
 	"keysym": "EuroSign",
 	"name": "EURO SIGN",
 	"plane": "Basic Multilingual Plane",
-	"utf16be": "20 AC",
-	"utf16le": "AC 20",
+	"utf16be": "20 ac",
+	"utf16le": "ac 20",
 	"utf8": "e2 82 ac",
 	"width": "ambiguous",
 	"xml": "&#x20ac;"

diff --git a/unidata/unidata.go b/unidata/unidata.go
@@ -178,7 +178,7 @@ func (c Codepoint) UTF16(bigEndian bool) string {
 			p[1], p[0], p[3], p[2] = p[0], p[1], p[2], p[3]
 		}
 	}
-	return fmt.Sprintf(`% X`, p)
+	return fmt.Sprintf(`% x`, p)
 }
 
 func (c Codepoint) XMLEntity() string {