Skip to content

Commit

Permalink
unicode/utf16: add func RuneLen
Browse files Browse the repository at this point in the history
This CL adds func RuneLen and, while here, also uses RuneLen to simplify
the code in Encode.

Fixes #44940

Change-Id: Ifd3b537f69880dfd32a69a6733d8d3c2b5d4ecba
Reviewed-on: https://go-review.googlesource.com/c/go/+/569755
Reviewed-by: Ian Lance Taylor <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
Commit-Queue: Ian Lance Taylor <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Auto-Submit: Ian Lance Taylor <[email protected]>
  • Loading branch information
callthingsoff authored and gopherbot committed Mar 7, 2024
1 parent e0ba596 commit ef4f2a0
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 5 deletions.
1 change: 1 addition & 0 deletions api/next/44940.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pkg unicode/utf16, func RuneLen(int32) int #44940
3 changes: 3 additions & 0 deletions doc/next/6-stdlib/99-minor/unicode/utf16/44940.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The [`unicode/utf16.RuneLen`](/pkg/unicode/utf16#RuneLen) function returns
the number of 16-bit words in the UTF-16 encoding of the rune. It returns -1
if the rune is not a valid value to encode in UTF-16.
3 changes: 3 additions & 0 deletions src/unicode/utf16/export_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ package utf16

// Extra names for constants so we can validate them during testing.
// Each exported name is an alias for the unexported constant of the same
// name, making the value reachable from tests outside the package.
const (
// Surr1 aliases surr1: RuneLen treats runes in [0, surr1) as one word.
// NOTE(review): presumably the start of the surrogate range — confirm in utf16.go.
Surr1 = surr1
// Surr3 aliases surr3: RuneLen treats runes in [surr3, surrSelf) as one word.
Surr3 = surr3
// SurrSelf aliases surrSelf: RuneLen treats runes in [surrSelf, maxRune] as two words.
SurrSelf = surrSelf
// MaxRune aliases maxRune, the largest rune RuneLen accepts.
MaxRune = maxRune
// ReplacementChar aliases replacementChar.
ReplacementChar = replacementChar
)
21 changes: 16 additions & 5 deletions src/unicode/utf16/utf16.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,19 @@ func EncodeRune(r rune) (r1, r2 rune) {
return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
}

// RuneLen reports how many 16-bit words the UTF-16 encoding of r occupies.
// A result of -1 means r is not a valid value to encode in UTF-16.
func RuneLen(r rune) int {
	// Runes in [0, surr1) and [surr3, surrSelf) fit in a single word.
	belowGap := 0 <= r && r < surr1
	aboveGap := surr3 <= r && r < surrSelf
	if belowGap || aboveGap {
		return 1
	}
	// Runes in [surrSelf, maxRune] need two words.
	if surrSelf <= r && r <= maxRune {
		return 2
	}
	// Everything else (negative, in [surr1, surr3), or above maxRune)
	// cannot be encoded.
	return -1
}

// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
func Encode(s []rune) []uint16 {
n := len(s)
Expand All @@ -64,13 +77,11 @@ func Encode(s []rune) []uint16 {
a := make([]uint16, n)
n = 0
for _, v := range s {
switch {
case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
// normal rune
switch RuneLen(v) {
case 1: // normal rune
a[n] = uint16(v)
n++
case surrSelf <= v && v <= maxRune:
// needs surrogate sequence
case 2: // needs surrogate sequence
r1, r2 := EncodeRune(v)
a[n] = uint16(r1)
a[n+1] = uint16(r2)
Expand Down
20 changes: 20 additions & 0 deletions src/unicode/utf16/utf16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@ func TestConstants(t *testing.T) {
}
}

// TestRuneLen checks RuneLen on both sides of every boundary it
// distinguishes: one-word runes, two-word runes, and values that cannot be
// encoded (negative, inside [Surr1, Surr3), or above MaxRune).
func TestRuneLen(t *testing.T) {
	for _, tt := range []struct {
		r      rune
		length int
	}{
		{0, 1},
		{Surr1 - 1, 1},
		// Both ends of the [Surr1, Surr3) range, which RuneLen rejects.
		{Surr1, -1},
		{Surr3 - 1, -1},
		{Surr3, 1},
		{SurrSelf - 1, 1},
		{SurrSelf, 2},
		{MaxRune, 2},
		{MaxRune + 1, -1},
		{-1, -1},
	} {
		if length := RuneLen(tt.r); length != tt.length {
			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, length, tt.length)
		}
	}
}

type encodeTest struct {
in []rune
out []uint16
Expand Down

0 comments on commit ef4f2a0

Please sign in to comment.