Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make getindex for String check if indices are valid #22572

Merged
merged 4 commits into from
Sep 20, 2017
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ Breaking changes

This section lists changes that do not have deprecation warnings.

* `getindex(s::String, r::UnitRange{Int})` now throws `UnicodeError` if `last(r)`
is not a valid index into `s` ([#22572]).

* `ntuple(f, n::Integer)` throws `ArgumentError` if `n` is negative.
Previously an empty tuple was returned ([#21697]).

Expand Down
2 changes: 1 addition & 1 deletion base/dates/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ function DateFormat(f::AbstractString, locale::DateLocale=ENGLISH)

letters = String(collect(keys(CONVERSION_SPECIFIERS)))
for m in eachmatch(Regex("(?<!\\\\)([\\Q$letters\\E])\\1*"), f)
tran = replace(f[prev_offset:m.offset - 1], r"\\(.)", s"\1")
tran = replace(f[prev_offset:prevind(f, m.offset)], r"\\(.)", s"\1")

if !isempty(prev)
letter, width = prev
Expand Down
2 changes: 1 addition & 1 deletion base/repl/LineEdit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ function refresh_multi_line(termbuf::TerminalBuffer, terminal::UnixTerminal, buf
# in this case, we haven't yet written the cursor position
line_pos -= slength # '\n' gets an extra pos
if line_pos < 0 || !moreinput
num_chars = (line_pos >= 0 ? llength : strwidth(l[1:(line_pos + slength)]))
num_chars = (line_pos >= 0 ? llength : strwidth(l[1:prevind(l, line_pos + slength + 1)]))
curs_row, curs_pos = divrem(lindent + num_chars - 1, cols)
curs_row += cur_row
curs_pos += 1
Expand Down
2 changes: 1 addition & 1 deletion base/repl/REPL.jl
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ function setup_interface(
end
# Check if input line starts with "julia> ", remove it if we are in prompt paste mode
jl_prompt_len = 7
if (firstline || isprompt_paste) && (oldpos + jl_prompt_len <= sizeof(input) && input[oldpos:oldpos+jl_prompt_len-1] == JULIA_PROMPT)
if (firstline || isprompt_paste) && startswith(SubString(input, oldpos), JULIA_PROMPT)
isprompt_paste = true
oldpos += jl_prompt_len
# If we are prompt pasting and current statement does not begin with julia> , skip to next line
Expand Down
19 changes: 13 additions & 6 deletions base/repl/REPLCompletions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ function completes_global(x, name)
end

function appendmacro!(syms, macros, needle, endchar)
append!(syms, s[2:end-sizeof(needle)]*endchar for s in filter(x -> endswith(x, needle), macros))
r = Regex("^.(.*)$needle\$")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps Regex("^.(.*)\\Q$needle\\E\$") since you don't want special characters to have special interpretation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure :). I will fix it once I am sure that CI passes correctly (it now looks like it will go through without errors).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CI looks fine now, failures are unrelated.

for s in macros
m = match(r, s)
m === nothing || push!(syms, m[1]*endchar)
end
end

function filtered_mod_names(ffunc::Function, mod::Module, name::AbstractString, all::Bool=false, imported::Bool=false)
Expand Down Expand Up @@ -482,9 +486,11 @@ function completions(string, pos)
paths, r, success = complete_path(replace(string[r], r"\\ ", " "), pos)

if inc_tag == :string &&
length(paths) == 1 && # Only close if there's a single choice,
!isdir(expanduser(replace(string[startpos:start(r)-1] * paths[1], r"\\ ", " "))) && # except if it's a directory
(length(string) <= pos || string[pos+1] != '"') # or there's already a " at the cursor.
length(paths) == 1 && # Only close if there's a single choice,
!isdir(expanduser(replace(string[startpos:prevind(string, start(r))] * paths[1],
r"\\ ", " "))) && # except if it's a directory
(length(string) <= pos ||
string[nextind(string,pos)] != '"') # or there's already a " at the cursor.
paths[1] *= "\""
end

Expand Down Expand Up @@ -534,10 +540,11 @@ function completions(string, pos)
# <Mod>/src/<Mod>.jl
# <Mod>.jl/src/<Mod>.jl
if isfile(joinpath(dir, pname))
endswith(pname, ".jl") && push!(suggestions, pname[1:end-3])
endswith(pname, ".jl") && push!(suggestions,
pname[1:prevind(pname, end-2)])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should have been OK, since we know the three last characters are ASCII. -2 has no reason to be more correct than -3, right? Same below.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately not:

julia> x="α.jl"
"α.jl"

julia> x[end-3]
ERROR: UnicodeError: invalid character index
Stacktrace:
 [1] slow_utf8_next(::Ptr{UInt8}, ::UInt8, ::Int64, ::Int64) at .\strings\string.jl:172
 [2] next at .\strings\string.jl:204 [inlined]
 [3] getindex(::String, ::Int64) at .\strings\basic.jl:32

I can use end-2 precisely because I know that the last three characters are ASCII.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, carry on, I need to learn how to count up to three. :-)

else
mod_name = if endswith(pname, ".jl")
pname[1:end - 3]
pname[1:prevind(pname, end-2)]
else
pname
end
Expand Down
13 changes: 9 additions & 4 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -235,20 +235,25 @@ isvalid(s::String, i::Integer) =

function getindex(s::String, r::UnitRange{Int})
isempty(r) && return ""
i, j = first(r), last(r)
l = sizeof(s)
i = first(r)
if i < 1 || i > l
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a styling issue, but I don't see the point of changing these from if to &&, especially since you add a new if block for UnicodeError.

throw(BoundsError(s, i))
end
@inbounds si = codeunit(s, i)
if is_valid_continuation(si)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better to keep throwing a UnicodeError rather than a BoundsError, as the former gives more details about the problematic code point. This can be helpful if you import invalid Unicode data and want to understand what the problem is.

end
j = last(r)
if j > l
throw(BoundsError())
throw(BoundsError(s, j))
end
j = nextind(s,j)-1
unsafe_string(pointer(s,i), j-i+1)
@inbounds sj = codeunit(s, j)
if is_valid_continuation(sj)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, sj))
end
j = nextind(s,j)
unsafe_string(pointer(s,i), j-i)
end

function search(s::String, c::Char, i::Integer = 1)
Expand Down
5 changes: 3 additions & 2 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,10 @@ end
@test first('\x00':'\x7f') === '\x00'
@test last('\x00':'\x7f') === '\x7f'

# make sure substrings handle last code unit even if not start of codepoint
# make sure substrings do not accept code unit if it is not start of codepoint
let s = "x\u0302"
@test s[1:3] == s
@test_throws UnicodeError s[1:3]
@test s[1:2]==s
end

# issue #9781
Expand Down