Skip to content

Commit

Permalink
Add Unicode validation function
Browse files Browse the repository at this point in the history
    Adds `check_string` function, which checks a vector of bytes,
    16-bit or 32-bit words, or an AbstractString for validity,
    either for UTF-8, UTF-16, or UTF-32 encoding.
    By default, `Modified UTF-8 (long \0 encoding)` and
    `CESU-8 (surrogate pairs encoded as 2 UTF-8 3-byte sequences)`
    are allowed, but other overlong encoded sequences are not;
    this can be changed with the `options` keyword argument.
Add unit tests of all the errors found by `check_string`
Updated documentation to not use doxygen tags.
  • Loading branch information
ScottPJones committed Jun 5, 2015
1 parent 47a0eb1 commit 79f2618
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 0 deletions.
1 change: 1 addition & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ include("osutils.jl")
# strings & printing
include("utferror.jl")
include("utftypes.jl")
include("utfcheck.jl")
include("char.jl")
include("ascii.jl")
include("utf8.jl")
Expand Down
187 changes: 187 additions & 0 deletions base/utfcheck.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
# and also to return information necessary to convert to other encodings

# True if `c` is a lead (high) surrogate code unit, 0xd800-0xdbff
function is_surrogate_lead(c::Unsigned)
    return (c & 0xfffffc00) == 0xd800
end

# True if `c` is a trail (low) surrogate code unit, 0xdc00-0xdfff
function is_surrogate_trail(c::Unsigned)
    return (c & 0xfffffc00) == 0xdc00
end

# True if `c` is any surrogate code unit (lead or trail), 0xd800-0xdfff
function is_surrogate_codeunit(c::Unsigned)
    return (c & 0xfffff800) == 0xd800
end

# True if `c` has the bit pattern 10xxxxxx of a UTF-8 continuation byte
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end

## Option bits tested in the `options` keyword argument of check_string

const UTF_NO_LONG_NULL  = 1 << 0   # don't accept 0xc0 0x80 for '\0'
const UTF_NO_SURROGATES = 1 << 1   # don't accept surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2   # accept long encodings (other than long null in UTF-8)

## Bits set in the flags value returned by check_string

const UTF_LONG      = 1 << 0   # long encodings are present
const UTF_LATIN1    = 1 << 1   # characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2   # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3   # characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4   # non-BMP characters present
const UTF_SURROGATE = 1 << 5   # surrogate pairs present

## Fold one UTF-8 continuation byte into a partially decoded character.
# `ch` holds the bits decoded so far, `byt` is the candidate continuation byte,
# and `pos` is only used to report where the error occurred.
# Throws UnicodeError(UTF_ERR_CONT, pos, byt) if `byt` is not a continuation byte,
# otherwise returns `ch` shifted left by 6 bits with the low 6 bits of `byt` appended.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    if !is_valid_continuation(byt)
        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
    end
    payload = byt & 0x3f
    return (ch << 6) | payload
end

#=
Validates and calculates number of characters in a UTF-8 encoded vector of UInt8

### Input Arguments:
* `dat`     Vector of UInt8
### Optional Input Arguments:
* `len`     index of the last byte to examine (default `sizeof(dat)`)
* `pos`     0-based start offset: the first byte examined is `dat[pos+1]` (default 0)
### Keyword Argument:
* `options` Bit mask of UTF_NO_LONG_NULL, UTF_NO_SURROGATES, UTF_ACCEPT_LONG.
            The default (0) is:
** Accept long \0 (Modified UTF-8)
** Accept surrogate pairs in UTF-8 (CESU-8)
** Don't accept other long encodings
### Returns:
(total characters, flags, 4-byte, 3-byte, 2-byte)

A surrogate pair encoded as two 3-byte sequences counts as one character and
as one 4-byte character.
### Throws:
* UnicodeError  on any invalid sequence; the reported position is the index of the
                first byte of the offending sequence, or of the offending byte itself
                for a bad continuation byte
* BoundsError   if the (non-empty) range to examine does not lie inside `dat`
=#
function check_string(dat::Vector{UInt8}, len = sizeof(dat), pos = 0 ; options::Integer=0)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    # The scan below runs under @inbounds, so refuse a non-empty range outside of `dat`
    (pos < len) && (pos < 0 || len > sizeof(dat)) && throw(BoundsError())
    @inbounds while pos < len
        ch = dat[pos += 1]
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf can only continue a sequence, never start one
                throw(UnicodeError(UTF_ERR_CONT, pos, ch))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x1f, dat[pos += 1], pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    flags |= UTF_LONG
                else
                    # pos is on the continuation byte, the sequence started at pos-1
                    throw(UnicodeError(UTF_ERR_LONG, pos-1, ch))
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                    byt = dat[pos += 1]
                    # every surrogate is encoded with the lead byte 0xed
                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                    # 0x0000d is a UInt32 literal: the payload bits of the 0xed lead byte
                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
                    surr = get_continuation(surr, dat[pos += 1], pos)
                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
                    (options & UTF_NO_SURROGATES) != 0 && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                    flags |= UTF_SURROGATE
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                    # an overlong ASCII character is not a 2-byte character
                    # (same rule as in the 4-byte branch below)
                    ch > 0x7f && (num2byte += 1)
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                if ch > 0x10ffff
                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    # pos is on the 4th byte, the sequence started at pos-3
                    throw(UnicodeError(UTF_ERR_LONG, pos-3, ch))
                end
            else
                # 0xf5-0xff can never appear in UTF-8
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end

#=
Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string

### Input Arguments:
* `dat`     Vector of UInt16, UInt32, or an AbstractString
### Optional Input Arguments:
* `len`     last index to examine (default `endof(dat)`)
* `pos`     index at which to start (default `start(dat)`)
### Keyword Argument:
* `options` Bit mask; only UTF_NO_SURROGATES is examined by this method, and only
            when `dat` is not a Vector{UInt16}. The default (0) is:
** Accept surrogate pairs in UTF-32 / AbstractString input
### Returns:
(total characters, flags, 4-byte, 3-byte, 2-byte)

A lead surrogate followed by a trail surrogate counts as one character and
as one 4-byte character.
### Throws:
UnicodeError; the reported position is the index of the offending code unit
(for a missing trail surrogate, the index of the unpaired lead surrogate)
=#
function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
                      dat::T,
                      len = endof(dat),
                      pos = start(dat)
                      ; options::Integer = 0)
    local ch::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos <= len
        # next() moves pos past the unit it returns, so remember where that unit is
        # in order to report the offending index rather than the one following it
        chpos = pos
        ch, pos = next(dat, pos)
        totalchar += 1
        if ch > 0x7f
            if ch < 0x100
                num2byte += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                num2byte += 1
                flags |= UTF_UNICODE2
            elseif T != Vector{UInt16} && ch > 0x0ffff
                (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, chpos, ch))
                num4byte += 1
            elseif !is_surrogate_codeunit(ch)
                num3byte += 1
            elseif is_surrogate_lead(ch)
                pos > len && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, chpos, ch))
                # next character *must* be a trailing surrogate character
                chpos = pos
                ch, pos = next(dat, pos)
                !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, chpos, ch))
                num4byte += 1
                # surrogate pairs are the normal encoding in UTF-16, so they are only
                # flagged (or rejected) for the other input types
                if T != Vector{UInt16}
                    (options & UTF_NO_SURROGATES) != 0 && throw(UnicodeError(UTF_ERR_SURROGATE, chpos, ch))
                    flags |= UTF_SURROGATE
                end
            else
                # trail surrogate without a preceding lead surrogate
                throw(UnicodeError(UTF_ERR_NOT_LEAD, chpos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
99 changes: 99 additions & 0 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1719,3 +1719,102 @@ d = UTF32String(c)
c[1] = 'A'
@test d=="A"

# Test invalid sequences
# Every case below only requires that check_string rejects the input with a UnicodeError

byt = 0x0
# Continuation byte not after lead
for byt in 0x80:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt])
end

# Test lead bytes
for byt in 0xc0:0xff
    # Single lead byte at end of string
    @test_throws UnicodeError Base.check_string(UInt8[byt])
    # Lead followed by non-continuation character < 0x80
    @test_throws UnicodeError Base.check_string(UInt8[byt,0])
    # Lead followed by non-continuation character > 0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt,0xc0])
end

# Test overlong 2-byte
# (0xc0 0x80, the long form of '\0', is accepted by default, so start at 0x81)
for byt in 0x81:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[0xc0,byt])
end
for byt in 0x80:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[0xc1,byt])
end

# Test overlong 3-byte
for byt in 0x80:0x9f
    @test_throws UnicodeError Base.check_string(UInt8[0xe0,byt,0x80])
end

# Test overlong 4-byte
# The lead byte must be 0xf0: 0xef is a 3-byte lead, so the sequence would only be
# rejected because of its stray 4th byte, not because it is an overlong encoding
for byt in 0x80:0x8f
    @test_throws UnicodeError Base.check_string(UInt8[0xf0,byt,0x80,0x80])
end

# Test 4-byte > 0x10ffff
for byt in 0x90:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[0xf4,byt,0x80,0x80])
end
for byt in 0xf5:0xf7
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80])
end

# Test 5-byte
for byt in 0xf8:0xfb
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80,0x80])
end

# Test 6-byte
for byt in 0xfc:0xfd
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80,0x80,0x80])
end

# Test 7-byte
@test_throws UnicodeError Base.check_string(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])

# Three and above byte sequences
for byt in 0xe0:0xef
    # Lead followed by only 1 continuation byte
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80])
    # Lead ended by non-continuation character < 0x80
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0])
    # Lead ended by non-continuation character > 0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0xc0])
end

# 3-byte encoded surrogate character(s)
# Single surrogate
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80])
# Not followed by surrogate
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])
# Trailing surrogate first
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])
# Followed by lead surrogate
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])

# Four byte sequences
for byt in 0xf0:0xf4
    # Lead followed by only 2 continuation bytes
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80])
    # Lead followed by non-continuation character < 0x80
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0])
    # Lead followed by non-continuation character > 0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0xc0])
end

# Surrogates
@test_throws UnicodeError Base.check_string(UInt16[0xd800])
@test_throws UnicodeError Base.check_string(UInt16[0xdc00])
@test_throws UnicodeError Base.check_string(UInt16[0xdc00,0xd800])

# Surrogates in UTF-32
@test_throws UnicodeError Base.check_string(UInt32[0xd800])
@test_throws UnicodeError Base.check_string(UInt32[0xdc00])
@test_throws UnicodeError Base.check_string(UInt32[0xdc00,0xd800])

# Characters > 0x10ffff
@test_throws UnicodeError Base.check_string(UInt32[0x110000])

0 comments on commit 79f2618

Please sign in to comment.