Skip to content

Commit

Permalink
Allow printing codepoints by UTF-8 byte sequence
Browse files Browse the repository at this point in the history
Should probably also add UTF-16, but it's annoying with the whole LE/BE
variants, no one asked for it, and I'm never going to use it myself. So
can't be bothered adding it right now 😅

Fixes #25
  • Loading branch information
arp242 committed Sep 23, 2021
1 parent 5615ea1 commit 23594a3
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 5 deletions.
16 changes: 16 additions & 0 deletions .readme.gotxt
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,22 @@ some other tool if you want to process the data further.
ChangeLog
---------

### v2.3.0 (unreleased)

- Update to Unicode 14.0.

- UTF-16 and JSON are now printed in lower case, just like UTF-8 already was.
Upper case is used only for codepoints (e.g. U+00AC).

- `uni print` can now print a codepoint from its UTF-8 byte sequence; for
example, to print the € sign:

uni p utf8:e282ac
uni p 'utf8:e2 82 ac'
uni p 'utf8:0xe2 0x82 0xac'

Bytes can optionally be separated by any combination of `0x`, `-`, `_`, or spaces.

### v2.2.1 (2021-06-15)

- You can now use `uni p 0d40` to print U+0028 by its decimal value (40).
Expand Down
16 changes: 16 additions & 0 deletions README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,22 @@ some other tool if you want to process the data further.
ChangeLog
---------

### v2.3.0 (unreleased)

- Update to Unicode 14.0.

- UTF-16 and JSON are now printed in lower case, just like UTF-8 already was.
Upper case is used only for codepoints (e.g. U+00AC).

- `uni print` can now print a codepoint from its UTF-8 byte sequence; for
example, to print the € sign:

uni p utf8:e282ac
uni p 'utf8:e2 82 ac'
uni p 'utf8:0xe2 0x82 0xac'

Bytes can optionally be separated by any combination of `0x`, `-`, `_`, or spaces.

### v2.2.1 (2021-06-15)

- You can now use `uni p 0d40` to print U+0028 by its decimal value (40).
Expand Down
42 changes: 42 additions & 0 deletions uni.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ Commands:
Range U+2042..U+2050, 0o101..0x5a
Categories and Blocks OtherPunctuation, Po,
GeneralPunctuation
UTF-8 UTF-8 byte sequence, optionally
separated by any combination of
'0x', '-', '_', or spaces:
utf8:e282ac
utf8:0xe20x820xac
'utf8:e2 82 ac'
'utf8:0xe2 0x82 0xac'
all Everything
emoji [query] Search emojis.
Expand Down Expand Up @@ -400,12 +407,47 @@ func search(args []string, format string, quiet, raw, asJSON, or bool) error {
return nil
}

// utfClean removes every "0x", space, "_", and "-" from a string. print applies
// it to the text following "utf8:" so that only the hex digits of the byte
// sequence remain before they are parsed two at a time.
var utfClean = strings.NewReplacer("0x", "", " ", "", "_", "", "-", "")

func print(args []string, format string, quiet, raw, asJSON bool) error {
f, err := NewFormat(format, asJSON, !quiet, knownColumns...)
if err != nil {
return err
}
for _, a := range args {
a = strings.ToLower(a)

// UTF-8
if strings.HasPrefix(a, "utf8:") {
a = a[5:]

seq := utfClean.Replace(a)
if len(seq)%2 == 1 {
seq = "0" + seq
}

byt := make([]byte, 0, len(seq)/2)
for i := 0; len(seq) > i; i += 2 {
b, err := strconv.ParseUint(seq[i:i+2], 16, 8)
if err != nil {
return fmt.Errorf("invalid UTF-8 sequence %q: %q is not a hex number",
a, seq[i:i+2])
}
byt = append(byt, byte(b))
}

r, s := utf8.DecodeRune(byt)
if r == utf8.RuneError {
return fmt.Errorf("invalid UTF-8 sequence: %q", a)
}
if s != len(byt) {
return fmt.Errorf("multiple characters in sequence %q", a)
}

f.Line(toLine(unidata.Codepoints[r], raw))
continue
}

canon := unidata.CanonicalCategory(a)

// Print everything.
Expand Down
16 changes: 12 additions & 4 deletions uni_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,14 @@ func TestPrint(t *testing.T) {
{[]string{"-q", "p", "all"}, "ASTERISM", 34626, -1},

{[]string{"-q", "-r", "p", "U9"}, "'\t'", 1, -1},

// UTF-8
{[]string{"-q", "p", "utf8:75"}, "'u'", 1, -1},
{[]string{"-q", "p", "UTF8:75"}, "'u'", 1, -1},
{[]string{"-q", "p", "utf8:e282ac"}, "'€'", 1, -1},
{[]string{"-q", "p", "utf8:e2 82 ac"}, "'€'", 1, -1},
{[]string{"-q", "p", "utf8:0xe20x820xac"}, "'€'", 1, -1},
{[]string{"-q", "p", "utf8:0xE2 0x82 0xAC"}, "'€'", 1, -1},
}

for _, tt := range tests {
Expand All @@ -177,7 +185,7 @@ func TestPrint(t *testing.T) {
}()

if int(*exit) != tt.wantExit {
t.Fatalf("wrong exit: %d", *exit)
t.Fatalf("exit %d: %s", *exit, outbuf.String())
}

out := outbuf.String()
Expand Down Expand Up @@ -336,12 +344,12 @@ func TestJSON(t *testing.T) {
"digraph": "=e",
"hex": "20ac",
"html": "€",
"json": "\\u20AC",
"json": "\\u20ac",
"keysym": "EuroSign",
"name": "EURO SIGN",
"plane": "Basic Multilingual Plane",
"utf16be": "20 AC",
"utf16le": "AC 20",
"utf16be": "20 ac",
"utf16le": "ac 20",
"utf8": "e2 82 ac",
"width": "ambiguous",
"xml": "€"
Expand Down
2 changes: 1 addition & 1 deletion unidata/unidata.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func (c Codepoint) UTF16(bigEndian bool) string {
p[1], p[0], p[3], p[2] = p[0], p[1], p[2], p[3]
}
}
return fmt.Sprintf(`% X`, p)
return fmt.Sprintf(`% x`, p)
}

func (c Codepoint) XMLEntity() string {
Expand Down

0 comments on commit 23594a3

Please sign in to comment.