Throw UnicodeError for s[i:j] when j is not at the start or the end of a

code point. Plus a few tests. Fixes JuliaLang#14158.
JobJob · Dec 4, 2015 · f814cc5 · f814cc5
1 parent 79356b8
commit f814cc5
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 9 deletions.
diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl
@@ -101,26 +101,33 @@ sizeof(s::UTF8String) = sizeof(s.data)
 
 lastidx(s::UTF8String) = length(s.data)
 
+isfirstbyte(c::UInt8) = (c & 0xc0) != 0x80 # == !is_valid_continuation(c)
+
 isvalid(s::UTF8String, i::Integer) =
-    (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
+    (1 <= i <= endof(s.data)) && isfirstbyte(s.data[i])
 
 const empty_utf8 = UTF8String(UInt8[])
-
 function getindex(s::UTF8String, r::UnitRange{Int})
     isempty(r) && return empty_utf8
     i, j = first(r), last(r)
     d = s.data
-    if i < 1 || i > length(s.data)
+    if i < 1 || i > length(d)
         throw(BoundsError(s, i))
     end
     if is_valid_continuation(d[i])
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
     end
-    if j > length(d)
-        throw(BoundsError())
+    if j < 1 || j > length(d)
+        throw(BoundsError(s, j)) # i was validated above; j is the offending index
+    end
+    # ensure j is the first or last byte of a character
+    if isfirstbyte(d[j])
+        UTF8String(d[i:j + utf8_trailing[d[j]+1]]) # include the trailing bytes of the character starting at j
+    elseif j == length(d) || isfirstbyte(d[j+1])
+        UTF8String(d[i:j]) # j is already the last byte of a character
+    else
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, d[j]))
     end
-    j = nextind(s,j)-1
-    UTF8String(d[i:j])
 end
 
 function search(s::UTF8String, c::Char, i::Integer)

diff --git a/test/strings/util.jl b/test/strings/util.jl
@@ -208,9 +208,32 @@ end
 # Issue 13332
 @test replace("abc", 'b', 2.1) == "a2.1c"
 
-# chomp/chop
-@test chomp("foo\n") == "foo"
+# chomp/chop for ASCII/UTF8
+for lineend in ["\n","\r\n"]
+    for str in ["foo", "foø", "føo", "fø", "ø", "\n", ""]
+        @test chomp(str*lineend) == str
+    end
+end
+@test chomp("\r\r\n") == "\r" # || "\r\r" ?
+
+import Base.chomp!
+for lineend in ("\n","\r\n")
+    for str in ("foo", "foø", "føo", "fø", "ø", "\n", "")
+        s = str*lineend
+        chomp!(s)
+        @test s == str
+    end
+end
+
+let s = "\r\r\n"
+    chomp!(s)
+    @test s == "\r" # || "\r\r" ?
+end
+
 @test chop("foob") == "foo"
+@test chop("fooƀ") == "foo"
+@test chop("fooƀä") == "fooƀ"
+@test chop("fooƀa") == "fooƀ"
 
 # bytes2hex and hex2bytes
 hex_str = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592"

diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl
@@ -26,6 +26,22 @@ let str = UTF8String(b"this is a test\xed\x80")
     @test convert(UTF8String, b"this is a test\xed\x80\x80") == "this is a test\ud000"
 end
 
+let s = UTF8String("🐨🐨")
+    #each koala is 4 bytes
+    @test s[1:4] == "🐨"
+    @test s[1:1] == "🐨"
+    @test s[1] == '🐨'
+    @test s[5:8] == "🐨"
+    @test s[5:5] == "🐨"
+    @test s[5] == '🐨'
+    @test_throws UnicodeError s[1:2]
+    @test_throws UnicodeError s[1:3]
+    @test_throws UnicodeError s[1:6]
+    @test_throws UnicodeError s[1:7]
+    @test_throws UnicodeError s[5:6]
+    @test_throws UnicodeError s[5:7]
+end
+
 ## Reverse of UTF8String
 @test reverse(UTF8String("")) == ""
 @test reverse(UTF8String("a")) == "a"