From 71683dffcd015477262a70bd90899bf7cfe50728 Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Fri, 15 Apr 2022 14:51:46 +0200 Subject: [PATCH] support malformed characters, the second Alternative to #44765. This disallows character literals that can not be created from iterating a UTF-8 string. fixes #25072 --- src/ast.c | 9 ++++++ src/flisp/flisp.c | 1 + src/flisp/flisp.h | 1 + src/flisp/julia_extensions.c | 50 +++++++++++++++++++++++++++++++++ src/julia-parser.scm | 13 ++++----- src/julia_internal.h | 1 + test/syntax.jl | 54 +++++++++++++++++++++++------------- 7 files changed, 103 insertions(+), 26 deletions(-) diff --git a/src/ast.c b/src/ast.c index 14a6e21e54bbe..6253350392d7d 100644 --- a/src/ast.c +++ b/src/ast.c @@ -112,6 +112,7 @@ JL_DLLEXPORT jl_sym_t *jl_acquire_sym; JL_DLLEXPORT jl_sym_t *jl_release_sym; JL_DLLEXPORT jl_sym_t *jl_acquire_release_sym; JL_DLLEXPORT jl_sym_t *jl_sequentially_consistent_sym; +JL_DLLEXPORT jl_sym_t *jl_julia_char_sym; static const uint8_t flisp_system_image[] = { @@ -375,6 +376,7 @@ void jl_init_common_symbols(void) jl_release_sym = jl_symbol("release"); jl_acquire_release_sym = jl_symbol("acquire_release"); jl_sequentially_consistent_sym = jl_symbol("sequentially_consistent"); + jl_julia_char_sym = jl_symbol("julia_char"); } JL_DLLEXPORT void jl_lisp_prompt(void) @@ -584,6 +586,13 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m ex = scm_to_julia_(fl_ctx, car_(e), mod); temp = jl_new_struct(jl_quotenode_type, ex); } + else if (sym == jl_julia_char_sym) { + value_t v = car_(e); + if (!(iscprim(v) && cp_class((cprim_t*)ptr(v)) == fl_ctx->uint32type)) + jl_error("malformed julia char"); + uint32_t c = *(uint32_t*)cp_data((cprim_t*)ptr(v)); + temp = jl_box_char(c); + } if (temp) { JL_GC_POP(); return temp; diff --git a/src/flisp/flisp.c b/src/flisp/flisp.c index 86421f6d966cf..32c0008025559 100644 --- a/src/flisp/flisp.c +++ b/src/flisp/flisp.c @@ -2396,6 +2396,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize) #endif fl_ctx->jl_sym = symbol(fl_ctx, "julia_value"); + fl_ctx->jl_char_sym = symbol(fl_ctx, "julia_char"); fl_ctx->the_empty_vector = tagptr(alloc_words(fl_ctx, 1), TAG_VECTOR); vector_setsize(fl_ctx->the_empty_vector, 0); diff --git a/src/flisp/flisp.h b/src/flisp/flisp.h index 209a4f2d4fcdb..e77904a32d1f2 100644 --- a/src/flisp/flisp.h +++ b/src/flisp/flisp.h @@ -502,6 +502,7 @@ struct _fl_context_t { value_t apply_func, apply_v, apply_e; value_t jl_sym; + value_t jl_char_sym; // persistent buffer (avoid repeated malloc/free) // for julia_extensions.c: normalize size_t jlbuflen; diff --git a/src/flisp/julia_extensions.c b/src/flisp/julia_extensions.c index 9fcd3e9789af4..f29e3972755c5 100644 --- a/src/flisp/julia_extensions.c +++ b/src/flisp/julia_extensions.c @@ -361,6 +361,55 @@ value_t fl_string2normsymbol(fl_context_t *fl_ctx, value_t *args, uint32_t nargs return symbol(fl_ctx, normalize(fl_ctx, (char*)cvalue_data(args[0]))); } +static uint32_t _iterate_continued(uint8_t *s, size_t n, size_t *i, uint32_t u) { + if (u < 0xc0000000) { ++*i; return u; } + uint8_t b; + + if (++*i >= n) return u; + b = s[*i]; // cont byte 1 + if ((b & 0xc0) != 0x80) return u; + u |= (uint32_t)b << 16; + + if (++*i >= n || u < 0xe0000000) return u; + b = s[*i]; // cont byte 2 + if ((b & 0xc0) != 0x80) return u; + u |= (uint32_t)b << 8; + + if (++*i >= n || u < 0xf0000000) return u; + b = s[*i]; // cont byte 3 + if ((b & 0xc0) != 0x80) return u; + u |= (uint32_t)b; ++*i; + + return u; +} + +static uint32_t _string_only_julia_char(uint8_t *s, size_t n) { + if (!(0 < n && n <= 4)) + return -1; + size_t i = 0; + uint8_t b = s[i]; + uint32_t u = (uint32_t)b << 24; + if (0x80 <= b && b <= 0xf7) + u = _iterate_continued(s, n, &i, u); + else + i = 1; + if (i < n) + return -1; + return u; +} + +value_t fl_string_only_julia_char(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) { + argcount(fl_ctx, "string.only-julia-char", nargs, 1); + if (!fl_isstring(fl_ctx, args[0])) + type_error(fl_ctx, "string.only-julia-char", "string", args[0]); + uint8_t *s = (uint8_t*)cvalue_data(args[0]); + size_t len = cv_len((cvalue_t*)ptr(args[0])); + uint32_t u = _string_only_julia_char(s, len); + if (u == (uint32_t)-1) + return fl_ctx->F; + return fl_list2(fl_ctx, fl_ctx->jl_char_sym, mk_uint32(fl_ctx, u)); +} + static const builtinspec_t julia_flisp_func_info[] = { { "skip-ws", fl_skipws }, { "accum-julia-symbol", fl_accum_julia_symbol }, @@ -371,6 +420,7 @@ static const builtinspec_t julia_flisp_func_info[] = { { "strip-op-suffix", fl_julia_strip_op_suffix }, { "underscore-symbol?", fl_julia_underscore_symbolp }, { "string->normsymbol", fl_string2normsymbol }, + { "string.only-julia-char", fl_string_only_julia_char }, { NULL, NULL } }; diff --git a/src/julia-parser.scm b/src/julia-parser.scm index 97a11df701a37..5a46f280c64f8 100644 --- a/src/julia-parser.scm +++ b/src/julia-parser.scm @@ -2481,13 +2481,12 @@ (write-char (not-eof-1 (read-char (ts:port s))) b)) (loop (read-char (ts:port s)))))) - (let ((str (unescape-string (io.tostring! b)))) - (let ((len (string-length str))) - (if (= len 1) - (string.char str 0) - (if (= len 0) - (error "invalid empty character literal") - (error "character literal contains multiple characters"))))))))) + (let* ((str (unescape-string (io.tostring! b))) + (c (string.only-julia-char str))) + (or c + (if (= (string-length str) 0) + (error "invalid empty character literal") + (error "character literal contains multiple characters")))))))) ;; symbol/expression quote ((eq? t ':) diff --git a/src/julia_internal.h b/src/julia_internal.h index 451e07eb9e3df..b0c16a0cd0dea 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1472,6 +1472,7 @@ extern JL_DLLEXPORT jl_sym_t *jl_acquire_sym; extern JL_DLLEXPORT jl_sym_t *jl_release_sym; extern JL_DLLEXPORT jl_sym_t *jl_acquire_release_sym; extern JL_DLLEXPORT jl_sym_t *jl_sequentially_consistent_sym; +extern JL_DLLEXPORT jl_sym_t *jl_julia_char_sym; JL_DLLEXPORT enum jl_memory_order jl_get_atomic_order(jl_sym_t *order, char loading, char storing); JL_DLLEXPORT enum jl_memory_order jl_get_atomic_order_checked(jl_sym_t *order, char loading, char storing); diff --git a/test/syntax.jl b/test/syntax.jl index 36e4f0745bafc..ff11eff85381d 100644 --- a/test/syntax.jl +++ b/test/syntax.jl @@ -276,9 +276,6 @@ end @test Meta.parse("'\"'") == Meta.parse("'\\\"'") == '"' == "\""[1] == '\42' # issue #24558 -@test_throws ParseError Meta.parse("'\\xff'") -@test_throws ParseError Meta.parse("'\\x80'") -@test_throws ParseError Meta.parse("'ab'") @test '\u2200' == "\u2200"[1] @test_throws ParseError Meta.parse("f(2x for x=1:10, y") @@ -317,19 +314,16 @@ let p = 15 @test 2p+1 == 31 # not a hex float literal end -function test_parseerror(str, msg) - try - Meta.parse(str) - @test false - catch e - @test isa(e,ParseError) && e.msg == msg - end +macro test_parseerror(str, msg) + ex = :(@test_throws ParseError($(esc(msg))) Meta.parse($(esc(str)))) + ex.args[2] = __source__ + return ex end -test_parseerror("0x", "invalid numeric constant \"0x\"") -test_parseerror("0b", "invalid numeric constant \"0b\"") -test_parseerror("0o", "invalid numeric constant \"0o\"") -test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"") -test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"") +@test_parseerror("0x", "invalid numeric constant \"0x\"") +@test_parseerror("0b", "invalid numeric constant \"0b\"") +@test_parseerror("0o", "invalid numeric constant \"0o\"") +@test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"") +@test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"") # issue #15798 @test Meta.lower(Main, Base.parse_input_line(""" @@ -345,8 +339,8 @@ test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"") """)::Expr) == 23341 # issue #15763 -test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1") -test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2") +@test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1") +@test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2") # issue #15828 @test Meta.lower(Main, Meta.parse("x...")) == Expr(:error, "\"...\" expression outside call") @@ -2059,8 +2053,8 @@ end == 1 # issue #29982 @test Meta.parse("'a'") == 'a' @test Meta.parse("'\U0061'") == 'a' -test_parseerror("''", "invalid empty character literal") -test_parseerror("'abc'", "character literal contains multiple characters") +@test_parseerror("''", "invalid empty character literal") +@test_parseerror("'abc'", "character literal contains multiple characters") # optional soft scope: #28789, #33864 @@ -3357,3 +3351,25 @@ demo44723()::Any = Base.Experimental.@opaque () -> true ? 1 : 2 @test y == Core.svec(2, 3) @test z == 4 end + +@testset "issue 25072" begin + @test '\xc0\x80' == reinterpret(Char, 0xc0800000) + @test '\x80' == reinterpret(Char, 0x80000000) + @test '\xff' == reinterpret(Char, 0xff000000) + @test_parseerror "'\\xff\\xff\\xff\\xff'" "character literal contains multiple characters" # == reinterpret(Char, 0xffffffff) + @test '\uffff' == Char(0xffff) + @test '\U00002014' == Char(0x2014) + @test '\100' == reinterpret(Char, UInt32(0o100) << 24) + @test_parseerror "'\\100\\42'" "character literal contains multiple characters" # == reinterpret(Char, (UInt32(0o100) << 24) | (UInt32(0o42) << 16)) + @test_parseerror "''" "invalid empty character literal" + @test_parseerror "'\\xff\\xff\\xff\\xff\\xff'" "character literal contains multiple characters" + @test_parseerror "'abcd'" "character literal contains multiple characters" + @test_parseerror "'\\uff\\xff'" "character literal contains multiple characters" + @test_parseerror "'\\xff\\uff'" "character literal contains multiple characters" + @test_parseerror "'\\xffa'" "character literal contains multiple characters" + @test_parseerror "'\\uffffa'" "character literal contains multiple characters" + @test_parseerror "'\\U00002014a'" "character literal contains multiple characters" + @test_parseerror "'\\1000'" "character literal contains multiple characters" + @test Meta.isexpr(Meta.parse("'a"), :incomplete) + @test ''' == "'"[1] +end