Skip to content

Commit

Permalink
bytes, strings: fix regression in IndexRune
Browse files Browse the repository at this point in the history
In all previous versions of Go, the behavior of IndexRune(s, r)
where r was utf.RuneError was that it would effectively return the
index of any invalid UTF-8 byte sequence (include RuneError).
Optimizations made in http://golang.org/cl/28537 and
http://golang.org/cl/28546 altered this undocumented behavior such
that RuneError would only match on the RuneError rune itself.

Although, the new behavior is arguably reasonable, it did break code
that depended on the previous behavior. Thus, we add special checks
to ensure that we preserve the old behavior.

There is a slight performance hit for correctness:
	benchmark                   old ns/op     new ns/op     delta
	BenchmarkIndexRune/10-4     19.3          21.6          +11.92%
	BenchmarkIndexRune/32-4     33.6          35.2          +4.76%
This only occurs on small strings. The performance hit for larger strings
is neglible and not shown.

Fixes #17611

Change-Id: I1d863a741213d46c40b2e1724c41245df52502a5
Reviewed-on: https://go-review.googlesource.com/32123
Run-TryBot: Joe Tsai <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
  • Loading branch information
dsnet committed Oct 26, 2016
1 parent 4f1e7be commit 4b26657
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 40 deletions.
23 changes: 19 additions & 4 deletions src/bytes/bytes.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,28 @@ func LastIndexByte(s []byte, c byte) int {
// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
// It returns the byte index of the first occurrence in s of the given rune.
// It returns -1 if rune is not present in s.
// If r is utf8.RuneError, it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s []byte, r rune) int {
if r < utf8.RuneSelf {
switch {
case 0 <= r && r < utf8.RuneSelf:
return IndexByte(s, byte(r))
case r == utf8.RuneError:
for i := 0; i < len(s); {
r1, n := utf8.DecodeRune(s[i:])
if r1 == utf8.RuneError {
return i
}
i += n
}
return -1
case !utf8.ValidRune(r):
return -1
default:
var b [utf8.UTFMax]byte
n := utf8.EncodeRune(b[:], r)
return Index(s, b[:n])
}
var b [utf8.UTFMax]byte
n := utf8.EncodeRune(b[:], r)
return Index(s, b[:n])
}

// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
Expand Down
50 changes: 33 additions & 17 deletions src/bytes/bytes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,15 +185,6 @@ var lastIndexAnyTests = []BinOpTest{
{dots + dots + dots, " ", -1},
}

var indexRuneTests = []BinOpTest{
{"", "a", -1},
{"", "☺", -1},
{"foo", "☹", -1},
{"foo", "o", 1},
{"foo☺bar", "☺", 3},
{"foo☺☻☹bar", "☹", 9},
}

// Execute f on each test case. funcName should be the name of f; it's used
// in failure reports.
func runIndexTests(t *testing.T, f func(s, sep []byte) int, funcName string, testCases []BinOpTest) {
Expand Down Expand Up @@ -348,17 +339,42 @@ func TestIndexByteSmall(t *testing.T) {
}

func TestIndexRune(t *testing.T) {
for _, tt := range indexRuneTests {
a := []byte(tt.a)
r, _ := utf8.DecodeRuneInString(tt.b)
pos := IndexRune(a, r)
if pos != tt.i {
t.Errorf(`IndexRune(%q, '%c') = %v`, tt.a, r, pos)
tests := []struct {
in string
rune rune
want int
}{
{"", 'a', -1},
{"", '☺', -1},
{"foo", '☹', -1},
{"foo", 'o', 1},
{"foo☺bar", '☺', 3},
{"foo☺☻☹bar", '☹', 9},
{"a A x", 'A', 2},
{"some_text=some_value", '=', 9},
{"☺a", 'a', 3},
{"a☻☺b", '☺', 4},

// RuneError should match any invalid UTF-8 byte sequence.
{"�", '�', 0},
{"\xff", '�', 0},
{"☻x�", '�', len("☻x")},
{"☻x\xe2\x98", '�', len("☻x")},
{"☻x\xe2\x98�", '�', len("☻x")},
{"☻x\xe2\x98x", '�', len("☻x")},

// Invalid rune values should never match.
{"a☺b☻c☹d\xe2\x98\xff\xed\xa0\x80", -1, -1},
{"a☺b☻c☹d\xe2\x98\xff\xed\xa0\x80", 0xD800, -1}, // Surrogate pair
{"a☺b☻c☹d\xe2\x98\xff\xed\xa0\x80", utf8.MaxRune + 1, -1},
}
for _, tt := range tests {
if got := IndexRune([]byte(tt.in), tt.rune); got != tt.want {
t.Errorf("IndexRune(%q, %d) = %v; want %v", tt.in, tt.rune, got, tt.want)
}
}

haystack := []byte("test世界")

allocs := testing.AllocsPerRun(1000, func() {
if i := IndexRune(haystack, 's'); i != 2 {
t.Fatalf("'s' at %d; want 2", i)
Expand All @@ -368,7 +384,7 @@ func TestIndexRune(t *testing.T) {
}
})
if allocs != 0 {
t.Errorf(`expected no allocations, got %f`, allocs)
t.Errorf("expected no allocations, got %f", allocs)
}
}

Expand Down
18 changes: 15 additions & 3 deletions src/strings/strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,24 @@ func LastIndex(s, sep string) int {

// IndexRune returns the index of the first instance of the Unicode code point
// r, or -1 if rune is not present in s.
// If r is utf8.RuneError, it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s string, r rune) int {
if r < utf8.RuneSelf {
switch {
case 0 <= r && r < utf8.RuneSelf:
return IndexByte(s, byte(r))
case r == utf8.RuneError:
for i, r := range s {
if r == utf8.RuneError {
return i
}
}
return -1
case !utf8.ValidRune(r):
return -1
default:
return Index(s, string(r))
}

return Index(s, string(r))
}

// IndexAny returns the index of the first instance of any Unicode code point
Expand Down
49 changes: 33 additions & 16 deletions src/strings/strings_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,21 +240,39 @@ func TestIndexRandom(t *testing.T) {
}
}

var indexRuneTests = []struct {
s string
rune rune
out int
}{
{"a A x", 'A', 2},
{"some_text=some_value", '=', 9},
{"☺a", 'a', 3},
{"a☻☺b", '☺', 4},
}

func TestIndexRune(t *testing.T) {
for _, test := range indexRuneTests {
if actual := IndexRune(test.s, test.rune); actual != test.out {
t.Errorf("IndexRune(%q,%d)= %v; want %v", test.s, test.rune, actual, test.out)
tests := []struct {
in string
rune rune
want int
}{
{"", 'a', -1},
{"", '☺', -1},
{"foo", '☹', -1},
{"foo", 'o', 1},
{"foo☺bar", '☺', 3},
{"foo☺☻☹bar", '☹', 9},
{"a A x", 'A', 2},
{"some_text=some_value", '=', 9},
{"☺a", 'a', 3},
{"a☻☺b", '☺', 4},

// RuneError should match any invalid UTF-8 byte sequence.
{"�", '�', 0},
{"\xff", '�', 0},
{"☻x�", '�', len("☻x")},
{"☻x\xe2\x98", '�', len("☻x")},
{"☻x\xe2\x98�", '�', len("☻x")},
{"☻x\xe2\x98x", '�', len("☻x")},

// Invalid rune values should never match.
{"a☺b☻c☹d\xe2\x98\xff\xed\xa0\x80", -1, -1},
{"a☺b☻c☹d\xe2\x98\xff\xed\xa0\x80", 0xD800, -1}, // Surrogate pair
{"a☺b☻c☹d\xe2\x98\xff\xed\xa0\x80", utf8.MaxRune + 1, -1},
}
for _, tt := range tests {
if got := IndexRune(tt.in, tt.rune); got != tt.want {
t.Errorf("IndexRune(%q, %d) = %v; want %v", tt.in, tt.rune, got, tt.want)
}
}

Expand All @@ -267,9 +285,8 @@ func TestIndexRune(t *testing.T) {
t.Fatalf("'世' at %d; want 4", i)
}
})

if allocs != 0 {
t.Errorf(`expected no allocations, got %f`, allocs)
t.Errorf("expected no allocations, got %f", allocs)
}
}

Expand Down

0 comments on commit 4b26657

Please sign in to comment.