Skip to content

Commit

Permalink
Throw UnicodeError for s[i:j] when j is not at the start or the end of a code point
Browse files Browse the repository at this point in the history
Plus a few tests.
Fixes JuliaLang#14158.
  • Loading branch information
JobJob committed Dec 4, 2015
1 parent 79356b8 commit f814cc5
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 9 deletions.
21 changes: 14 additions & 7 deletions base/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,26 +101,33 @@ sizeof(s::UTF8String) = sizeof(s.data)

lastidx(s::UTF8String) = length(s.data)

# Return `true` when the two high bits of `c` are not `10`, i.e. `c` is not a
# UTF-8 continuation byte. Equivalent to `!is_valid_continuation(c)`.
function isfirstbyte(c::UInt8)
    high_bits = c & 0xc0
    return high_bits != 0x80
end

# A byte index `i` is valid for `s` when it lies within `s.data` and the byte
# stored there is not a continuation byte (see `isfirstbyte`).
isvalid(s::UTF8String, i::Integer) =
    (1 <= i <= endof(s.data)) && isfirstbyte(s.data[i])

const empty_utf8 = UTF8String(UInt8[])

# Return the substring of `s` selected by the byte range `r` as a new UTF8String.
#
# * An empty range returns `empty_utf8`.
# * `first(r)` must be in bounds and must not point at a continuation byte.
# * `last(r)` must be in bounds and must point at either the first byte of a
#   character (the slice is then widened by `utf8_trailing` for that byte, so
#   the whole character is included) or the last byte of a character.
#
# Throws `BoundsError(s, idx)` naming the offending endpoint when it is out of
# range, and `UnicodeError(UTF_ERR_INVALID_INDEX, idx, byte)` when an endpoint
# falls inside a character.
function getindex(s::UTF8String, r::UnitRange{Int})
    isempty(r) && return empty_utf8
    i, j = first(r), last(r)
    d = s.data
    n = length(d)
    if i < 1 || i > n
        throw(BoundsError(s, i))
    end
    if is_valid_continuation(d[i])
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
    end
    # The range is non-empty and i >= 1, so j >= 1 already holds; only the upper
    # bound needs checking. Report `j`, the index that is actually out of range.
    if j > n
        throw(BoundsError(s, j))
    end
    # Ensure j is the first or last byte of a character.
    if isfirstbyte(d[j])
        # j starts a character: extend the slice by the trailing-byte count
        # looked up for this lead byte.
        return UTF8String(d[i:j + utf8_trailing[d[j]+1]])
    elseif j == n || isfirstbyte(d[j+1])
        # j is a continuation byte followed by the end of the data or by the
        # start of the next character, so it closes a character.
        return UTF8String(d[i:j])
    else
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, d[j]))
    end
end

function search(s::UTF8String, c::Char, i::Integer)
Expand Down
27 changes: 25 additions & 2 deletions test/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,32 @@ end
# Issue 13332
@test replace("abc", 'b', 2.1) == "a2.1c"

# chomp/chop
@test chomp("foo\n") == "foo"
# chomp/chop for ASCII/UTF8
# For each line ending, appending it to a string and calling chomp must give
# back the original string. The inputs cover ASCII, multi-byte characters in
# the middle and at the end, the empty string, and a string that is itself "\n".
for lineend in ["\n","\r\n"]
for str in ["foo", "foø", "føo", "", "ø", "\n", ""]
@test chomp(str*lineend) == str
end
end
# Only the final "\r\n" is expected to be removed here; the trailing note marks
# the alternative result "\r\r" as an open question.
@test chomp("\r\r\n") == "\r" # || "\r\r" ?

# Same cases for chomp!: the test inspects `s` itself after the call, so chomp!
# is expected to modify its argument rather than only return a new string.
import Base.chomp!
for lineend in ("\n","\r\n")
for str in ("foo", "foø", "føo", "", "ø", "\n", "")
s = str*lineend
chomp!(s)
@test s == str
end
end

# Mirror of the chomp("\r\r\n") case above, with the same open question.
let s = "\r\r\n"
chomp!(s)
@test s == "\r" # || "\r\r" ?
end

# chop must drop exactly the last character, whether that character or the one
# before it is ASCII or multi-byte.
@test chop("foob") == "foo"
@test chop("fooƀ") == "foo"
@test chop("fooƀä") == "fooƀ"
@test chop("fooƀa") == "fooƀ"

# bytes2hex and hex2bytes
hex_str = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592"
Expand Down
16 changes: 16 additions & 0 deletions test/unicode/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,22 @@ let str = UTF8String(b"this is a test\xed\x80")
@test convert(UTF8String, b"this is a test\xed\x80\x80") == "this is a test\ud000"
end

# Range indexing into a UTF8String made of two 4-byte characters.
let koalas = UTF8String("🐨🐨")
    # Each koala is 4 bytes: the first occupies bytes 1-4, the second bytes 5-8.
    for lead in (1, 5)
        # A range ending on the character's last byte selects that character ...
        @test koalas[lead:lead+3] == "🐨"
        # ... and so does a range ending on its first byte.
        @test koalas[lead:lead] == "🐨"
        @test koalas[lead] == '🐨'
    end
    # A range whose end falls strictly inside a character must throw.
    for inside in (1:2, 1:3, 1:6, 1:7, 5:6, 5:7)
        @test_throws UnicodeError koalas[inside]
    end
end

## Reverse of UTF8String
@test reverse(UTF8String("")) == ""
@test reverse(UTF8String("a")) == "a"
Expand Down

0 comments on commit f814cc5

Please sign in to comment.