Skip to content

Commit

Permalink
properly support malformed char literals
Browse files Browse the repository at this point in the history
Fixes the parsing of char literals like `'\xc0\x80'`. At first, I tried
to replicate the behavior of `getindex` on a string in Julia here, but
then I noticed that we probably also want to support cases like
`'\xff\xff'`, which would give two characters in a Julia string. Now
this supports any combination of characters as long as they are between
1 and 4 bytes, so even literals like `'abcd'` are allowed. I think this
makes sense because otherwise we wouldn't be able to reparse every valid
char in Julia, but I could see Python users being confused about why
Julia only supports strings up to length 4... 😄

fixes #25072
  • Loading branch information
simeonschaub committed Mar 27, 2022
1 parent 19eb307 commit 6c955c6
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 10 deletions.
9 changes: 9 additions & 0 deletions src/ast.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ JL_DLLEXPORT jl_sym_t *jl_acquire_sym;
JL_DLLEXPORT jl_sym_t *jl_release_sym;
JL_DLLEXPORT jl_sym_t *jl_acquire_release_sym;
JL_DLLEXPORT jl_sym_t *jl_sequentially_consistent_sym;
JL_DLLEXPORT jl_sym_t *jl_julia_char_sym;


static const uint8_t flisp_system_image[] = {
Expand Down Expand Up @@ -366,6 +367,7 @@ void jl_init_common_symbols(void)
jl_release_sym = jl_symbol("release");
jl_acquire_release_sym = jl_symbol("acquire_release");
jl_sequentially_consistent_sym = jl_symbol("sequentially_consistent");
jl_julia_char_sym = jl_symbol("julia_char");
}

JL_DLLEXPORT void jl_lisp_prompt(void)
Expand Down Expand Up @@ -575,6 +577,13 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m
ex = scm_to_julia_(fl_ctx, car_(e), mod);
temp = jl_new_struct(jl_quotenode_type, ex);
}
else if (sym == jl_julia_char_sym) {
value_t v = car_(e);
if (!(iscprim(v) && cp_class((cprim_t*)ptr(v)) == fl_ctx->uint32type))
jl_error("malformed julia char");
uint32_t c = *(uint32_t*)cp_data((cprim_t*)ptr(v));
temp = jl_box_char(c);
}
if (temp) {
JL_GC_POP();
return temp;
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2402,6 +2402,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
#endif

fl_ctx->jl_sym = symbol(fl_ctx, "julia_value");
fl_ctx->jl_char_sym = symbol(fl_ctx, "julia_char");

fl_ctx->the_empty_vector = tagptr(alloc_words(fl_ctx, 1), TAG_VECTOR);
vector_setsize(fl_ctx->the_empty_vector, 0);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ struct _fl_context_t {
value_t apply_func, apply_v, apply_e;

value_t jl_sym;
value_t jl_char_sym;
// persistent buffer (avoid repeated malloc/free)
// for julia_extensions.c: normalize
size_t jlbuflen;
Expand Down
25 changes: 25 additions & 0 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,30 @@ value_t fl_string2normsymbol(fl_context_t *fl_ctx, value_t *args, uint32_t nargs
return symbol(fl_ctx, normalize(fl_ctx, (char*)cvalue_data(args[0])));
}

// Return the uint32 representation if the string can be represented as a single Julia `Char`
// object. Otherwise return false. Note that it does allow for overlong chars like 'abcd', as
// long as they don't exceed 4 bytes
value_t fl_string_only_julia_char(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) {
argcount(fl_ctx, "string.only-julia-char", nargs, 1);
if (!fl_isstring(fl_ctx, args[0]))
type_error(fl_ctx, "string.only-julia-char", "string", args[0]);
uint8_t *s = (uint8_t*)cvalue_data(args[0]);
size_t len = cv_len((cvalue_t*)ptr(args[0]));
if (!(0 < len && len <= 4))
return fl_ctx->F;

uint32_t u = (uint32_t)s[0] << 24;
if (len == 1) goto ret;
u |= (uint32_t)s[1] << 16;
if (len == 2) goto ret;
u |= (uint32_t)s[2] << 8;
if (len == 3) goto ret;
u |= (uint32_t)s[3];

ret:
return fl_list2(fl_ctx, fl_ctx->jl_char_sym, mk_uint32(fl_ctx, u));
}

static const builtinspec_t julia_flisp_func_info[] = {
{ "skip-ws", fl_skipws },
{ "accum-julia-symbol", fl_accum_julia_symbol },
Expand All @@ -371,6 +395,7 @@ static const builtinspec_t julia_flisp_func_info[] = {
{ "strip-op-suffix", fl_julia_strip_op_suffix },
{ "underscore-symbol?", fl_julia_underscore_symbolp },
{ "string->normsymbol", fl_string2normsymbol },
{ "string.only-julia-char", fl_string_only_julia_char },
{ NULL, NULL }
};

Expand Down
13 changes: 6 additions & 7 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2481,13 +2481,12 @@
(write-char (not-eof-1 (read-char (ts:port s)))
b))
(loop (read-char (ts:port s))))))
(let ((str (unescape-string (io.tostring! b))))
(let ((len (string-length str)))
(if (= len 1)
(string.char str 0)
(if (= len 0)
(error "invalid empty character literal")
(error "character literal contains multiple characters")))))))))
(let* ((str (unescape-string (io.tostring! b)))
(c (string.only-julia-char str)))
(or c
(if (= (string-length str) 0)
(error "invalid empty character literal")
(error "character literal contains multiple characters"))))))))

;; symbol/expression quote
((eq? t ':)
Expand Down
1 change: 1 addition & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,7 @@ extern JL_DLLEXPORT jl_sym_t *jl_acquire_sym;
extern JL_DLLEXPORT jl_sym_t *jl_release_sym;
extern JL_DLLEXPORT jl_sym_t *jl_acquire_release_sym;
extern JL_DLLEXPORT jl_sym_t *jl_sequentially_consistent_sym;
extern JL_DLLEXPORT jl_sym_t *jl_julia_char_sym;

JL_DLLEXPORT enum jl_memory_order jl_get_atomic_order(jl_sym_t *order, char loading, char storing);
JL_DLLEXPORT enum jl_memory_order jl_get_atomic_order_checked(jl_sym_t *order, char loading, char storing);
Expand Down
13 changes: 10 additions & 3 deletions test/syntax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,6 @@ end
@test Meta.parse("'\"'") == Meta.parse("'\\\"'") == '"' == "\""[1] == '\42'

# issue #24558
@test_throws ParseError Meta.parse("'\\xff'")
@test_throws ParseError Meta.parse("'\\x80'")
@test_throws ParseError Meta.parse("'ab'")
@test '\u2200' == "\u2200"[1]

@test_throws ParseError Meta.parse("f(2x for x=1:10, y")
Expand Down Expand Up @@ -3280,3 +3277,13 @@ end
# issue 44723
demo44723()::Any = Base.Experimental.@opaque () -> true ? 1 : 2
@test demo44723()() == 1

@testset "issue 25072" begin
@test '\xc0\x80' == reinterpret(Char, 0xc0800000)
@test '\x80' == reinterpret(Char, 0x80000000)
@test '\xff' == reinterpret(Char, 0xff000000)
@test '\xff\xff\xff\xff' == reinterpret(Char, 0xffffffff)
@test 'abcd' == reinterpret(Char, 0x61626364)
@test_throws ParseError Meta.parse("''")
@test_throws ParseError Meta.parse("'\\xff\\xff\\xff\\xff\\xff'")
end

0 comments on commit 6c955c6

Please sign in to comment.