From 627173bcc986e4cd17a2e8d7ded7d977fbe68694 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 18 May 2018 16:03:29 +0200 Subject: [PATCH] Enable PCRE UTF-8 validity string checks (#26731) Strings are not guaranteed to contain valid UTF-8, and PCRE documentation says that the behavior is undefined in that case. --- base/regex.jl | 4 ++-- test/regex.jl | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index 11327e8d6741d..32d93275997c8 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -4,8 +4,8 @@ include("pcre.jl") -const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.NO_UTF_CHECK | PCRE.ALT_BSUX -const DEFAULT_MATCH_OPTS = PCRE.NO_UTF_CHECK +const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX +const DEFAULT_MATCH_OPTS = zero(UInt32) mutable struct Regex pattern::String diff --git a/test/regex.jl b/test/regex.jl index bd0e87e9a8f87..fe5ce3c7f58bd 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -57,3 +57,19 @@ end # Proper unicode handling @test match(r"∀∀", "∀x∀∀∀").match == "∀∀" + +@test_throws ErrorException match(r"a", "\xe2\x88") # 1 byte missing at end +@test_throws ErrorException match(r"a", "\xe2\x08\x80") # byte 2 top bits not 0x80 +@test_throws ErrorException match(r"a", "\xf8\x89\x89\x80\x80") # 5-byte character is not allowed (RFC 3629) +@test_throws ErrorException match(r"a", "\xf4\x9f\xbf\xbf") # code points greater than 0x10ffff are not defined +@test_throws ErrorException match(r"a", "\Udfff") # code points 0xd800-0xdfff are not defined +@test_throws ErrorException match(r"a", "\xc0\x80") # overlong 2-byte sequence +@test_throws ErrorException match(r"a", "\xff") # illegal byte (0xfe or 0xff) + +@test_throws ErrorException Regex("\xe2\x88") # 1 byte missing at end +@test_throws ErrorException Regex("\xe2\x08\x80") # byte 2 top bits not 0x80 +@test_throws ErrorException Regex("\xf8\x89\x89\x80\x80") # 5-byte character is not allowed (RFC 3629) +@test_throws ErrorException 
Regex("\xf4\x9f\xbf\xbf") # code points greater than 0x10ffff are not defined +@test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined +@test_throws ErrorException Regex("\xc0\x80") # overlong 2-byte sequence +@test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff)