Add Unicode validation function

Adds a `check_string` function, which checks a vector of bytes, 16-bit words, or 32-bit words, or an AbstractString, for validity as UTF-8, UTF-16, or UTF-32 encoded text. By default, `Modified UTF-8 (long \0 encoding)` and `CESU-8 (surrogate pairs encoded as 2 UTF-8 3-byte sequences)` are allowed, while other overlong encoded sequences are rejected; this can be changed with the `options` keyword argument. Adds unit tests for all the errors found by `check_string`. Updates the documentation to not use doxygen tags.
JuliaLang · Jun 5, 2015 · 79f2618 · 79f2618
1 parent 47a0eb1
commit 79f2618
Show file tree

Hide file tree

Showing 3 changed files with 287 additions and 0 deletions.
diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -86,6 +86,7 @@ include("osutils.jl")
 # strings & printing
 include("utferror.jl")
 include("utftypes.jl")
+include("utfcheck.jl")
 include("char.jl")
 include("ascii.jl")
 include("utf8.jl")

diff --git a/base/utfcheck.jl b/base/utfcheck.jl
@@ -0,0 +1,187 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
+#  and also to return information necessary to convert to other encodings
+
+# Predicates classifying a code unit / byte by its high bits.
+# The masks are 32-bit literals: 0xfffffc00 == ~0x003ff and 0xfffff800 == ~0x007ff.
+
+# true for 0xd800-0xdbff (leading surrogate)
+is_surrogate_lead(c::Unsigned) = (c & 0xfffffc00) == 0xd800
+# true for 0xdc00-0xdfff (trailing surrogate)
+is_surrogate_trail(c::Unsigned) = (c & 0xfffffc00) == 0xdc00
+# true for 0xd800-0xdfff (any surrogate, leading or trailing)
+is_surrogate_codeunit(c::Unsigned) = (c & 0xfffff800) == 0xd800
+# true when the top two bits of the low byte are `10`, i.e. 0x80-0xbf
+is_valid_continuation(c) = (0xc0 & c) == 0x80
+
+## Option bits accepted by the `options` keyword of the check_string methods
+## (combine with `|`; 0 selects the default behavior)
+
+const UTF_NO_LONG_NULL  = 1 << 0    # don't accept 0xc0 0x80 for '\0'
+const UTF_NO_SURROGATES = 1 << 1    # don't accept surrogate pairs in UTF-8/UTF-32
+const UTF_ACCEPT_LONG   = 1 << 2    # accept long encodings (other than long null in UTF-8)
+
+## Result bits reported in the `flags` value returned by the check_string methods
+
+const UTF_LONG      = 1 << 0        # long encodings are present
+const UTF_LATIN1    = 1 << 1        # characters in range 0x80-0xff present
+const UTF_UNICODE2  = 1 << 2        # characters in range 0x100-0x7ff present
+const UTF_UNICODE3  = 1 << 3        # characters in range 0x800-0xd7ff, 0xe000-0xffff
+const UTF_UNICODE4  = 1 << 4        # non-BMP characters present
+const UTF_SURROGATE = 1 << 5        # surrogate pairs present
+
+## Fold one UTF-8 continuation byte into the character value decoded so far.
+## `ch` is the partial value, `byt` the candidate continuation byte and `pos` its
+## index (only used for error reporting). Throws UnicodeError(UTF_ERR_CONT, ...)
+## if `byt` is not a continuation byte; otherwise returns `ch` shifted left by
+## 6 bits with the low 6 bits of `byt` appended.
+@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
+    is_valid_continuation(byt) || throw(UnicodeError(UTF_ERR_CONT, pos, byt))
+    payload = byt & 0x3f
+    (ch << 6) | payload
+end
+
+#= Validates and calculates number of characters in a UTF-8 encoded vector of UInt8
+
+### Input Arguments:
+    * dat    Vector of UInt8 holding the data to check
+### Optional Input Arguments:
+    * len    index of the last byte to check (default: sizeof(dat));
+             not checked against the real size of dat (accesses are @inbounds)
+    * pos    number of leading bytes to skip, checking starts at dat[pos+1] (default: 0)
+### Keyword Argument:
+    * options Bitwise combination of UTF_NO_LONG_NULL, UTF_NO_SURROGATES and
+      UTF_ACCEPT_LONG. The default (0) is:
+    ** Accept long \0 (Modified UTF-8)
+    ** Accept surrogate pairs in UTF-8 (CESU-8)
+    ** Don't accept other long encodings
+### Returns:
+    (total characters, flags, 4-byte, 3-byte, 2-byte)
+    where flags is a combination of the UTF_LONG, UTF_LATIN1, UTF_UNICODE2,
+    UTF_UNICODE3, UTF_UNICODE4 and UTF_SURROGATE bits, and a CESU-8 surrogate pair
+    is counted as one character and as one 4-byte character.
+### Throws:
+    UnicodeError
+=#
+function check_string(dat::Vector{UInt8}, len = sizeof(dat), pos = 0 ; options::Integer=0)
+    local byt::UInt8, ch::UInt32, surr::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    @inbounds while pos < len
+        ch = dat[pos += 1]
+        totalchar += 1
+        if ch > 0x7f
+            # Check UTF-8 encoding
+            if ch < 0xc0
+                # 0x80-0xbf are continuation bytes and can never start a sequence.
+                # Without this check they would be decoded as 2-byte lead bytes, so
+                # e.g. [0x80,0x80] would be accepted as an overlong '\0'.
+                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+            elseif ch < 0xe0
+                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
+                (pos == len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                # a 2-byte lead (110xxxxx) carries 5 payload bits
+                ch = get_continuation(ch & 0x1f, dat[pos += 1], pos)
+                if ch > 0x7f
+                    num2byte += 1
+                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    flags |= UTF_LONG
+                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
+                    # 0xc0 0x80, the Modified UTF-8 encoding of '\0'
+                    flags |= UTF_LONG
+                else
+                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
+                end
+            elseif ch < 0xf0
+                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
+                (pos + 2 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                # check for surrogate pairs, make sure correct
+                if is_surrogate_codeunit(ch)
+                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
+                    # next character *must* be a trailing surrogate character
+                    (pos + 3 > len) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
+                    byt = dat[pos += 1]
+                    # every 3-byte encoded surrogate starts with 0xed
+                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
+                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
+                    surr = get_continuation(surr, dat[pos += 1], pos)
+                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
+                    (options & UTF_NO_SURROGATES) != 0 && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
+                    flags |= UTF_SURROGATE
+                    # the pair stands for a single non-BMP character
+                    num4byte += 1
+                elseif ch > 0x07ff
+                    num3byte += 1
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    # This is an overly long encoded character
+                    flags |= UTF_LONG
+                    # only count it as a 2-byte character when it is not plain ASCII
+                    # (same rule as the overlong 4-byte case below)
+                    ch > 0x7f && (num2byte += 1)
+                else
+                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
+                end
+            elseif ch < 0xf5
+                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
+                (pos + 3 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
+                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                ch = get_continuation(ch, dat[pos += 1], pos)
+                # pos is now on the last byte, so pos-3 is the lead byte of the sequence
+                if ch > 0x10ffff
+                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
+                elseif ch > 0xffff
+                    num4byte += 1
+                elseif is_surrogate_codeunit(ch)
+                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
+                elseif (options & UTF_ACCEPT_LONG) != 0
+                    # This is an overly long encoded character
+                    flags |= UTF_LONG
+                    if ch > 0x7ff
+                        num3byte += 1
+                    elseif ch > 0x7f
+                        num2byte += 1
+                    end
+                else
+                    throw(UnicodeError(UTF_ERR_LONG, pos-3, ch))
+                end
+            else
+                # 0xf5-0xff can never appear in UTF-8
+                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+            end
+        end
+    end
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
+
+#= Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string
+
+### Input Arguments:
+    * dat    Vector of UInt16, Vector of UInt32, or an AbstractString
+### Optional Input Arguments:
+    * len    last index to check (default: endof(dat))
+    * pos    iteration state to start from (default: start(dat))
+### Keyword Argument:
+    * options Only UTF_NO_SURROGATES is consulted by this method: when it is set, a
+      surrogate pair found in a Vector{UInt32} or an AbstractString raises an error.
+      Surrogate pairs in a Vector{UInt16} are always accepted.
+### Returns:
+    (total characters, flags, 4-byte, 3-byte, 2-byte)
+### Throws:
+    UnicodeError
+=#
+function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
+                      dat::T,
+                      len = endof(dat),
+                      pos = start(dat)
+                      ; options::Integer = 0)
+    local ch::UInt32
+    flags::UInt = 0
+    totalchar = num2byte = num3byte = num4byte = 0
+    # UTF-16 code units cannot hold values above 0xffff, and surrogate pairs are
+    # the normal way to encode non-BMP characters there
+    notutf16 = (T != Vector{UInt16})
+    @inbounds while pos <= len
+        ch, pos = next(dat, pos)
+        totalchar += 1
+        # ASCII needs no further classification
+        ch > 0x7f || continue
+        if ch < 0x800
+            num2byte += 1
+            flags |= (ch < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
+        elseif notutf16 && ch > 0x0ffff
+            (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
+            num4byte += 1
+        elseif !is_surrogate_codeunit(ch)
+            num3byte += 1
+        elseif !is_surrogate_lead(ch)
+            # a trailing surrogate with no leading surrogate in front of it
+            throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
+        else
+            pos > len && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
+            # next character *must* be a trailing surrogate character
+            ch, pos = next(dat, pos)
+            is_surrogate_trail(ch) || throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
+            # the pair stands for a single non-BMP character
+            num4byte += 1
+            if notutf16
+                (options & UTF_NO_SURROGATES) == 0 || throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
+                flags |= UTF_SURROGATE
+            end
+        end
+    end
+    num3byte != 0 && (flags |= UTF_UNICODE3)
+    num4byte != 0 && (flags |= UTF_UNICODE4)
+    return totalchar, flags, num4byte, num3byte, num2byte
+end
diff --git a/test/strings.jl b/test/strings.jl
@@ -1719,3 +1719,102 @@ d = UTF32String(c)
 c[1] = 'A'
 @test d=="A"
 
+# Test invalid sequences
+# Every case below must be rejected by check_string with its default options
+
+byt = 0x0
+    # Continuation byte not after lead
+    for byt in 0x80:0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[byt])
+    end
+
+    # Test lead bytes
+    for byt in 0xc0:0xff
+        # Single lead byte at end of string
+        @test_throws UnicodeError Base.check_string(UInt8[byt])
+        # Lead followed by non-continuation character < 0x80
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0])
+        # Lead followed by non-continuation character > 0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0xc0])
+    end
+
+    # Test overlong 2-byte
+    # (0xc0 0x80, the long encoding of '\0', is accepted by default and so is skipped)
+    for byt in 0x81:0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[0xc0,byt])
+    end
+    for byt in 0x80:0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[0xc1,byt])
+    end
+
+    # Test overlong 3-byte
+    for byt in 0x80:0x9f
+        @test_throws UnicodeError Base.check_string(UInt8[0xe0,byt,0x80])
+    end
+
+    # Test overlong 4-byte
+    # The lead byte must be 0xf0: 0xef is a 3-byte lead, so 0xef,byt,0x80,0x80 would
+    # only fail because of the stray trailing byte, not because of an overlong encoding
+    for byt in 0x80:0x8f
+        @test_throws UnicodeError Base.check_string(UInt8[0xf0,byt,0x80,0x80])
+    end
+
+    # Test 4-byte > 0x10ffff
+    for byt in 0x90:0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[0xf4,byt,0x80,0x80])
+    end
+    for byt in 0xf5:0xf7
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80])
+    end
+
+    # Test 5-byte
+    for byt in 0xf8:0xfb
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80,0x80])
+    end
+
+    # Test 6-byte
+    for byt in 0xfc:0xfd
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80,0x80,0x80])
+    end
+
+    # Test 7-byte
+    @test_throws UnicodeError Base.check_string(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])
+
+    # Three and above byte sequences
+    for byt in 0xe0:0xef
+        # Lead followed by only 1 continuation byte
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80])
+        # Lead ended by non-continuation character < 0x80
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0])
+        # Lead ended by non-continuation character > 0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0xc0])
+    end
+
+    # 3-byte encoded surrogate character(s)
+    # Single surrogate
+    @test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80])
+    # Not followed by surrogate
+    @test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])
+    # Trailing surrogate first
+    @test_throws UnicodeError Base.check_string(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])
+    # Followed by lead surrogate
+    @test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])
+
+    # Four byte sequences
+    for byt in 0xf0:0xf4
+        # Lead followed by only 2 continuation bytes
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80])
+        # Lead followed by non-continuation character < 0x80
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0])
+        # Lead followed by non-continuation character > 0xbf
+        @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0xc0])
+    end
+
+# Surrogates: a lone lead, a lone trail and a trail-before-lead pair must all be
+# rejected, first as UTF-16 code units (UInt16), then as UTF-32 code points (UInt32)
+for UnitT in (UInt16, UInt32), units in (UInt16[0xd800], UInt16[0xdc00], UInt16[0xdc00,0xd800])
+    @test_throws UnicodeError Base.check_string(convert(Vector{UnitT}, units))
+end
+
+# Characters > 0x10ffff
+@test_throws UnicodeError Base.check_string(UInt32[0x110000])