From 52a67dbb6a6d7d1b3112eadca61f0fb8a6bd08f3 Mon Sep 17 00:00:00 2001 From: Oldes Date: Mon, 7 Dec 2020 18:21:46 +0100 Subject: [PATCH] FEAT: Allow line-comments inside binary value specification. resolves: https://github.com/Oldes/Rebol-wishes/issues/23 --- src/core/l-scan.c | 88 ++++++++++++++++++++++++++++++++++- src/core/l-types.c | 23 ++++++--- src/tests/units/lexer-test.r3 | 24 ++++++++++ 3 files changed, 127 insertions(+), 8 deletions(-) diff --git a/src/core/l-scan.c b/src/core/l-scan.c index 10413b9e68..70d7fcddfc 100644 --- a/src/core/l-scan.c +++ b/src/core/l-scan.c @@ -473,6 +473,84 @@ return src; } +/*********************************************************************** +** +*/ const REBYTE *Scan_Quote_Binary(const REBYTE *src, SCAN_STATE *scan_state) +/* +** Scan a binary string, remove spaces and comments. +** +** The result will be put into the temporary BUF_MOLD binary. +** +***********************************************************************/ +{ + REBOOL comm = FALSE; + REBINT chr; + REBCNT lines = 0; + REBSER *buf = BUF_MOLD; + + RESET_TAIL(buf); + + if (*src++ != '{') return 0; + + while (*src != '}') { + chr = *src; + + switch (chr) { + + case 0: + return 0; // Scan_state shows error location. + case '^': + chr = Scan_Char(&src); + if (chr == -1) return 0; + src--; + break; + case ';': + while (chr != 0) { + chr = *++src; + if (chr == '^') { + chr = Scan_Char(&src); + if (chr == -1) return 0; + src--; + } + if (chr == LF || chr == CR) { + goto new_line; + } + } + return 0; // end of input reached + case CR: + if (src[1] == LF) src++; + // fall thru + case LF: +new_line: + lines++; + // fall thru + case ' ': + case TAB: + src++; + continue; + + default: + if (chr >= 0x80) return 0; + } + + src++; + + if (SERIES_FULL(buf)) + Extend_Series(buf, 1); + + *BIN_SKIP(buf, buf->tail) = chr; + buf->tail++; + } + + src++; // Skip ending quote or brace. 
+ + if (scan_state) scan_state->line_count += lines; + + STR_TERM(buf); + + return src; +} + /*********************************************************************** ** @@ -929,7 +1007,11 @@ if (*cp == '{') { /* BINARY #{12343132023902902302938290382} */ scan_state->end = scan_state->begin; /* save start */ scan_state->begin = cp; - cp = Scan_Quote(cp, scan_state); // stores result string in BUF_MOLD !!?? + // Originally Scan_Quote was used here, collecting into BUF_MOLD, but the result was not used later. + // It was wasting resources, because Scan_Quote collects unicode (2 bytes per char). + // Scan_Quote_Binary collects ANSI and reports invalid input (like unicode char) much sooner. + // It also skips spaces and line-comments so these should not have to be tested by Decode_Binary later. + cp = Scan_Quote_Binary(cp, scan_state); // stores result string in BUF_MOLD !!?? scan_state->begin = scan_state->end; /* restore start */ if (cp) { scan_state->end = cp; @@ -1436,7 +1518,9 @@ extern REBSER *Scan_Full_Block(SCAN_STATE *scan_state, REBYTE mode_char); break; case TOKEN_BINARY: - Scan_Binary(bp, len, value); + // BUF_MOLD holds the preprocessed ANSI result without comments and spaces, + // we just still need to resolve the binary base (like `64#{`) from the input + Scan_Binary(Scan_Binary_Base(bp, len), BIN_DATA(BUF_MOLD), BIN_LEN(BUF_MOLD), value); LABEL_SERIES(VAL_SERIES(value), "scan binary"); break; diff --git a/src/core/l-types.c b/src/core/l-types.c index 63e422a1a3..ce0aa75e79 100644 --- a/src/core/l-types.c +++ b/src/core/l-types.c @@ -910,9 +910,9 @@ bad_hex: Trap0(RE_INVALID_CHARS); /*********************************************************************** ** -*/ const REBYTE *Scan_Binary(const REBYTE *cp, REBCNT len, REBVAL *value) +*/ REBINT *Scan_Binary_Base(const REBYTE *cp, REBCNT len) /* -** Scan and convert binary strings. 
+** Scan for binary base ** ***********************************************************************/ { @@ -926,14 +926,25 @@ bad_hex: Trap0(RE_INVALID_CHARS); cp = ep; } cp++; // skip # - if (*cp++ != '{') return 0; - len -= 2; + if (*cp++ != '{' || (len - 2) < 1) return 0; + return base; +} +/*********************************************************************** +** +*/ const REBYTE *Scan_Binary(REBINT base, const REBYTE *cp, REBCNT len, REBVAL *value) +/* +** Scan and convert binary strings according to the given base (like 2, 16, 64, 85). +** +***********************************************************************/ +{ + //O: no need to check the base here... Decode_Binary handles any case cp = Decode_Binary(value, cp, len, base, '}', FALSE); if (!cp) return 0; - cp = Skip_To_Char(cp, cp + len, '}'); - if (!cp) return 0; // series will be gc'd + //O: the check below is not needed, because the scanner already validated the input + //cp = Skip_To_Char(cp, cp + len, '}'); + //if (!cp) return 0; // series will be gc'd return cp; } diff --git a/src/tests/units/lexer-test.r3 b/src/tests/units/lexer-test.r3 index c611a0c2b4..4cc0bed711 100644 --- a/src/tests/units/lexer-test.r3 +++ b/src/tests/units/lexer-test.r3 @@ -171,6 +171,30 @@ Rebol [ ===end-group=== +===start-group=== "BINARY" + --test-- {binary! with spaces} + --assert #{00} = first transcode/only to binary! " #{0 0}" + --assert #{00} = first transcode/only to binary! "2#{0000 00 00}" + --assert #{00} = first transcode/only to binary! "2#{0000^/0000}" + --assert #{00} = first transcode/only to binary! "2#{0000^M0000}" + --assert #{01} = first transcode/only to binary! "2#{0000^-0001}" + --assert #{02} = first transcode/only to binary! "2#{0000^ 0010}" + --assert #{0001} = first transcode/only to binary! "16#{00 01}" + --assert #{0001} = first transcode/only to binary! "64#{AA E=}" + + --test-- {binary! 
with comments inside} + ;@@ https://github.com/Oldes/Rebol-wishes/issues/23 + --assert #{00} = first transcode/only/error to binary! "#{;XXX^/00}" + --assert #{00} = first transcode/only/error to binary! "#{00;XXX^/}" + --assert #{0002} = first transcode/only/error to binary! "#{00;XXX^/02}" + --assert #{0002} = first transcode/only/error to binary! "#{00;XXX^M02}" ;CR is also comment stopper + --test-- {binary! with other valid escapes} + --assert #{0003} = first transcode/only/error to binary! "#{^(30)^(30)03}" + --test-- {binary! with unicode char} ; is handled early + --assert error? first transcode/only/error to binary! "#{0č}" + +===end-group=== + ===start-group=== "Special tests" ;if "true" <> get-env "CONTINUOUS_INTEGRATION" [