-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds the `check_string` function, which checks a vector of bytes, 16-bit or 32-bit words, or an AbstractString for validity as UTF-8, UTF-16, or UTF-32 encoded data. By default, `Modified UTF-8 (long \0 encoding)` and `CESU-8 (surrogate pairs encoded as 2 UTF-8 3-byte sequences)` are allowed, while other overlong encoded sequences are rejected; this can be changed with the `options` keyword argument. Adds unit tests for all the errors found by `check_string`. Updates the documentation to not use doxygen tags.
- Loading branch information
1 parent
47a0eb1
commit 79f2618
Showing
3 changed files
with
287 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,187 @@ | ||
# This file is a part of Julia. License is MIT: http://julialang.org/license | ||
|
||
## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings, | ||
# and also to return information necessary to convert to other encodings | ||
|
||
# Return true when `c` is a UTF-16 leading (high) surrogate, i.e. lies in 0xd800-0xdbff.
# A range comparison is used instead of masking with `~0x003ff`: that literal is only
# 32 bits wide, so for wider unsigned arguments the mask silently dropped the upper bits.
is_surrogate_lead(c::Unsigned) = (0xd800 <= c <= 0xdbff)
# Return true when `c` is a UTF-16 trailing (low) surrogate, i.e. lies in 0xdc00-0xdfff.
# A range comparison is used instead of masking with `~0x003ff`: that literal is only
# 32 bits wide, so for wider unsigned arguments the mask silently dropped the upper bits.
is_surrogate_trail(c::Unsigned) = (0xdc00 <= c <= 0xdfff)
# Return true when `c` is any UTF-16 surrogate code unit, i.e. lies in 0xd800-0xdfff.
# A range comparison is used instead of masking with `~0x007ff`: that literal is only
# 32 bits wide, so for wider unsigned arguments the mask silently dropped the upper bits.
is_surrogate_codeunit(c::Unsigned) = (0xd800 <= c <= 0xdfff)
# Return true when `c` has the bit pattern 10xxxxxx of a UTF-8 continuation byte.
function is_valid_continuation(c)
    top_two_bits = c & 0xc0
    return top_two_bits == 0x80
end
|
||
## Options for check_string_* functions | ||
|
||
# Option bits; combine with `|` and pass as the `options` keyword argument.
const UTF_NO_LONG_NULL  = 1 << 0   # don't accept 0xc0 0x80 for '\0'
const UTF_NO_SURROGATES = 1 << 1   # don't accept surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2   # accept long encodings (other than long null in UTF-8)
|
||
# Result bits; these are OR-ed into the `flags` value returned by check_string.
const UTF_LONG      = 1 << 0   # long encodings are present
const UTF_LATIN1    = 1 << 1   # characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2   # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3   # characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4   # non-BMP characters present
const UTF_SURROGATE = 1 << 5   # surrogate pairs present
|
||
## Fold one UTF-8 continuation byte into a partially decoded character.
# `ch` is the value accumulated so far, `byt` the byte to append and `pos` the position
# reported in the error. Throws UnicodeError(UTF_ERR_CONT, pos, byt) when `byt` is not a
# continuation byte; otherwise returns `ch` shifted left by 6 bits with the low 6 bits
# of `byt` appended.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    is_valid_continuation(byt) || throw(UnicodeError(UTF_ERR_CONT, pos, byt))
    payload = byt & 0x3f
    return (ch << 6) | payload
end
|
||
#= Validates and calculates number of characters in a UTF-8 encoded vector of UInt8

### Input Arguments:
* dat      Vector of UInt8
### Optional Input Arguments:
* len      offset of the last byte to check (default: sizeof(dat))
* pos      0-based start offset, i.e. the first byte examined is dat[pos+1] (default: 0)
### Keyword Argument:
* options  Bitwise OR of UTF_NO_LONG_NULL, UTF_NO_SURROGATES and UTF_ACCEPT_LONG.
           The default (0) is:
** Accept long \0 (Modified UTF-8)
** Accept surrogate pairs in UTF-8 (CESU-8)
** Don't accept other long encodings
### Returns:
(total characters, flags, 4-byte, 3-byte, 2-byte)
A surrogate pair is counted as one character and as one 4-byte character.
### Throws:
UnicodeError   on malformed input
BoundsError    when the range described by pos/len does not lie inside dat
=#
function check_string(dat::Vector{UInt8}, len = sizeof(dat), pos = 0 ; options::Integer=0)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    # The scan below runs under @inbounds, so a range that leaves the vector must be
    # rejected up front instead of reading past the end of the data.
    if pos < len
        pos < 0 && throw(BoundsError(dat, pos + 1))
        len > length(dat) && throw(BoundsError(dat, len))
    end
    accept_long = (options & UTF_ACCEPT_LONG) != 0
    @inbounds while pos < len
        ch = dat[pos += 1]
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence.
                # Without this test they would be decoded as 2-byte lead bytes.
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x1f, dat[pos += 1], pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif accept_long
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # Modified UTF-8 encoding of '\0' (0xc0 0x80)
                    flags |= UTF_LONG
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                    byt = dat[pos += 1]
                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
                    surr = get_continuation(surr, dat[pos += 1], pos)
                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
                    (options & UTF_NO_SURROGATES) != 0 && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                    flags |= UTF_SURROGATE
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif accept_long
                    flags |= UTF_LONG
                    # An overlong 3-byte form may also hide an ASCII character, which
                    # needs only one byte and so must not be counted as 2-byte.
                    ch > 0x7f && (num2byte += 1)
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # pos now indexes the 4th byte, so pos-3 is the lead byte of the sequence
                if ch > 0x10ffff
                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
                elseif accept_long
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-3, ch))
                end
            else
                # 0xf5-0xff can never appear in UTF-8
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
|
||
#= Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string

### Input Arguments:
* dat      Vector of UInt16, UInt32, or an AbstractString
### Optional Input Arguments:
* len      last index to check (default: endof(dat))
* pos      index to start at (default: start(dat))
### Keyword Argument:
* options  Only UTF_NO_SURROGATES is examined by this method, and only when the input
           is not a Vector{UInt16}: when it is set, a surrogate pair raises an error
           instead of being accepted and flagged with UTF_SURROGATE.
### Returns:
(total characters, flags, 4-byte, 3-byte, 2-byte)
A surrogate pair is counted as one character and as one 4-byte character.
### Throws:
UnicodeError
=#
function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
                      dat::T,
                      len = endof(dat),
                      pos = start(dat)
                      ; options::Integer = 0)
    local ch::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos <= len
        ch, pos = next(dat, pos)
        totalchar += 1
        # ASCII characters need no further bookkeeping
        ch <= 0x7f && continue
        if ch < 0x800
            # needs two bytes in UTF-8; record which sub-range was seen
            num2byte += 1
            flags |= (ch < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif T != Vector{UInt16} && ch > 0x0ffff
            # a non-BMP value stored directly (not possible in a 16-bit code unit)
            ch > 0x10ffff && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            num4byte += 1
        elseif !is_surrogate_codeunit(ch)
            num3byte += 1
        elseif !is_surrogate_lead(ch)
            # a trailing surrogate with no leading surrogate before it
            throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
        else
            # leading surrogate: the next unit *must* be a trailing surrogate
            pos > len && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
            ch, pos = next(dat, pos)
            is_surrogate_trail(ch) || throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
            num4byte += 1
            if T != Vector{UInt16}
                (options & UTF_NO_SURROGATES) == 0 || throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
                flags |= UTF_SURROGATE
            end
        end
    end
    if num3byte != 0
        flags |= UTF_UNICODE3
    end
    if num4byte != 0
        flags |= UTF_UNICODE4
    end
    return totalchar, flags, num4byte, num3byte, num2byte
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters