diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl
index 5f278c0e18b4b..4936a8737c2e2 100644
--- a/base/unicode/utf8.jl
+++ b/base/unicode/utf8.jl
@@ -101,26 +101,42 @@ sizeof(s::UTF8String) = sizeof(s.data)
 
 lastidx(s::UTF8String) = length(s.data)
 
+# A byte can start a character unless it is a continuation byte (0b10xxxxxx).
+isfirstbyte(c::UInt8) = (c & 0xc0) != 0x80 # == !is_valid_continuation(c)
+
 isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
+    (1 <= i <= endof(s.data)) && isfirstbyte(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
-
 function getindex(s::UTF8String, r::UnitRange{Int})
+    # Substring of `s` for the byte range `r`. `first(r)` must index the first byte
+    # of a character; `last(r)` may index either the first or the last byte of a
+    # character. When it indexes a first byte, the slice is extended by the
+    # trailing-byte count looked up in `utf8_trailing`.
+    # Throws BoundsError for an endpoint outside the data and UnicodeError for an
+    # endpoint that falls inside a character.
     isempty(r) && return empty_utf8
     i, j = first(r), last(r)
     d = s.data
-    if i < 1 || i > length(s.data)
+    if i < 1 || i > length(d)
         throw(BoundsError(s, i))
     end
     if is_valid_continuation(d[i])
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
     end
-    if j > length(d)
-        throw(BoundsError())
+    if j < 1 || j > length(d)
+        # report the endpoint that is actually out of range
+        throw(BoundsError(s, j))
+    end
+    # ensure j is the first or last byte of a character
+    if isfirstbyte(d[j])
+        # j starts a character: extend the slice over its trailing bytes
+        UTF8String(d[i:j + utf8_trailing[d[j]+1]])
+    elseif j == length(d) || isfirstbyte(d[j+1])
+        # j ends a character: the next byte (if any) begins a new one
+        UTF8String(d[i:j])
+    else
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, d[j]))
     end
-    j = nextind(s,j)-1
-    UTF8String(d[i:j])
 end
 
 function search(s::UTF8String, c::Char, i::Integer)
diff --git a/test/strings/util.jl b/test/strings/util.jl
index d3a4e92034bf6..d80eecb58fb85 100644
--- a/test/strings/util.jl
+++ b/test/strings/util.jl
@@ -208,9 +208,32 @@ end
 # Issue 13332
 @test replace("abc", 'b', 2.1) == "a2.1c"
 
-# chomp/chop
-@test chomp("foo\n") == "foo"
+# chomp/chop for ASCII/UTF8
+for lineend in ["\n","\r\n"]
+    for str in ["foo", "foø", "føo", "fø", "ø", "\n", ""]
+        @test chomp(str*lineend) == str
+    end
+end
+@test chomp("\r\r\n") == "\r" # || "\r\r" ?
+
+import Base.chomp!
+for lineend in ("\n","\r\n")
+    for str in ("foo", "foø", "føo", "fø", "ø", "\n", "")
+        s = str*lineend
+        chomp!(s)
+        @test s == str
+    end
+end
+
+let s = "\r\r\n"
+    chomp!(s)
+    @test s == "\r" # || "\r\r" ?
+end
+
 @test chop("foob") == "foo"
+@test chop("fooƀ") == "foo"
+@test chop("fooƀä") == "fooƀ"
+@test chop("fooƀa") == "fooƀ"
 
 # bytes2hex and hex2bytes
 hex_str = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592"
diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl
index 591cb961e404e..8aec62e06239e 100644
--- a/test/unicode/utf8.jl
+++ b/test/unicode/utf8.jl
@@ -26,6 +26,23 @@ let str = UTF8String(b"this is a test\xed\x80")
     @test convert(UTF8String, b"this is a test\xed\x80\x80") == "this is a test\ud000"
 end
 
+let s = UTF8String("🐨🐨")
+    # each koala is 4 bytes
+    @test s[1:4] == "🐨"
+    @test s[1:1] == "🐨"
+    @test s[1] == '🐨'
+    @test s[5:8] == "🐨"
+    @test s[5:5] == "🐨"
+    @test s[5] == '🐨'
+    @test_throws UnicodeError s[1:2]
+    @test_throws UnicodeError s[1:3]
+    @test_throws UnicodeError s[1:6]
+    @test_throws UnicodeError s[1:7]
+    @test_throws UnicodeError s[5:6]
+    @test_throws UnicodeError s[5:7]
+    @test_throws BoundsError s[1:9]
+end
+
 ## Reverse of UTF8String
 @test reverse(UTF8String("")) == ""
 @test reverse(UTF8String("a")) == "a"