From 628234c64adffb91618dc78f4003d644527a3c1c Mon Sep 17 00:00:00 2001 From: Oldes Date: Fri, 27 Mar 2020 17:42:31 +0100 Subject: [PATCH] FEAT: support for Ascii85 (Base85) encoding `Ascii85` is a form of binary-to-text encoding used for example in PDF files. One can now use 85 as a `base` value in `debase` and `enbase` functions: ``` >> enbase/base "Lion" 85 == "9PJE_" >> debase/base "9PJE_" 85 == #{4C696F6E} ;== "Lion" if converted back to string ``` This addition is optional, requires compilation with `INCLUDE_BASE85` define. Implements wish: https://github.com/Oldes/Rebol-issues/issues/2410 --- make/make-settings.r | 1 + src/boot/natives.r | 4 +- src/core/f-enbase.c | 287 ++++++++++++++++++++++++++++++++- src/core/n-strings.c | 8 + src/tests/units/enbase-test.r3 | 65 ++++++++ 5 files changed, 362 insertions(+), 3 deletions(-) diff --git a/make/make-settings.r b/make/make-settings.r index c82ab81454..434a6dfa1d 100644 --- a/make/make-settings.r +++ b/make/make-settings.r @@ -18,6 +18,7 @@ Defines: [ USE_MIDI_DEVICE ;-- includes MIDI device when possible (Windows & macOS) ;INCLUDE_TASK ;-- tasks are not implemented yet, so include it only on demand + INCLUDE_BASE85 ;-- adds support for enbase/debase with base 85 (ASCII85) ;@@ optional fine tuning: ;DO_NOT_NORMALIZE_MAP_KEYS diff --git a/src/boot/natives.r b/src/boot/natives.r index 5079a7eb69..4de026aa4b 100644 --- a/src/boot/natives.r +++ b/src/boot/natives.r @@ -406,7 +406,7 @@ debase: native [ {Decodes binary-coded string (BASE-64 default) to binary value.} value [binary! string!] {The string to decode} /base {Binary base to use} - base-value [integer!] {The base to convert from: 64, 16, or 2} + base-value [integer!] {The base to convert from: 85, 64, 16, or 2} /url {Base 64 Decoding with URL and Filename Safe Alphabet} ] @@ -414,7 +414,7 @@ enbase: native [ {Encodes a string into a binary-coded string (BASE-64 default).} value [binary! string!] 
{If string, will be UTF8 encoded} /base {Binary base to use} - base-value [integer!] {The base to convert to: 64, 16, or 2} + base-value [integer!] {The base to convert to: 85, 64, 16, or 2} /url {Base 64 Encoding with URL and Filename Safe Alphabet} ] diff --git a/src/core/f-enbase.c b/src/core/f-enbase.c index a38ee06256..91e64a3caa 100644 --- a/src/core/f-enbase.c +++ b/src/core/f-enbase.c @@ -40,7 +40,7 @@ ***********************************************************************/ { #define BIN_ERROR (REBYTE)0x80 - #define BIN_SPACE (REBYTE)0x40 + #define BIN_SPACE (REBYTE)0x55 /* 85, one above the largest base-85 digit (84). NOTE(review): IS_BIN_SPACE masks with this value, which is no longer a single-bit flag -- confirm that macro is unused or still correct */ #define BIN_VALUE (REBYTE)0x3f #define IS_BIN_SPACE(c) (Debase64[c] & BIN_SPACE) @@ -279,6 +279,124 @@ /* 7F DEL */ BIN_ERROR, }; +#ifdef INCLUDE_BASE85 +#define BASE85_DIGITS 5 /* log85 (2^32) is 4.9926740807112 */ +/*********************************************************************** +** +*/ static const REBYTE Debase85[128] = +/* +** Base-85 (ASCII85) binary decoder table: indexed by a 7-bit character code, it holds the digit value 0..84 for '!'..'u', BIN_SPACE for the codes the decoder skips (0x08-0x0A, 0x0C, 0x0D and 0x20) and BIN_ERROR for every other code, 'z' included (the decoder tests for 'z' itself before starting a group). +** +***********************************************************************/ +{ + /* Control Chars */ + BIN_ERROR,BIN_ERROR,BIN_ERROR,BIN_ERROR, /* 80 */ + BIN_ERROR,BIN_ERROR,BIN_ERROR,BIN_ERROR, + BIN_SPACE,BIN_SPACE,BIN_SPACE,BIN_ERROR, + BIN_SPACE,BIN_SPACE,BIN_ERROR,BIN_ERROR, + BIN_ERROR,BIN_ERROR,BIN_ERROR,BIN_ERROR, + BIN_ERROR,BIN_ERROR,BIN_ERROR,BIN_ERROR, + BIN_ERROR,BIN_ERROR,BIN_ERROR,BIN_ERROR, + BIN_ERROR,BIN_ERROR,BIN_ERROR,BIN_ERROR, + + /* 20 */ BIN_SPACE, + /* 21 ! 
*/ 0, + /* 22 " */ 1, + /* 23 # */ 2, + /* 24 $ */ 3, + /* 25 % */ 4, + /* 26 & */ 5, + /* 27 ' */ 6, + /* 28 ( */ 7, + /* 29 ) */ 8, + /* 2A * */ 9, + /* 2B + */ 10, + /* 2C , */ 11, + /* 2D - */ 12, + /* 2E . */ 13, + /* 2F / */ 14, + /* 30 0 */ 15, + /* 31 1 */ 16, + /* 32 2 */ 17, + /* 33 3 */ 18, + /* 34 4 */ 19, + /* 35 5 */ 20, + /* 36 6 */ 21, + /* 37 7 */ 22, + /* 38 8 */ 23, + /* 39 9 */ 24, + /* 3A : */ 25, + /* 3B ; */ 26, + /* 3C < */ 27, + /* 3D = */ 28, + /* 3E > */ 29, + /* 3F ? 
*/ 30, + /* 40 @ */ 31, + /* 41 A */ 32, + /* 42 B */ 33, + /* 43 C */ 34, + /* 44 D */ 35, + /* 45 E */ 36, + /* 46 F */ 37, + /* 47 G */ 38, + /* 48 H */ 39, + /* 49 I */ 40, + /* 4A J */ 41, + /* 4B K */ 42, + /* 4C L */ 43, + /* 4D M */ 44, + /* 4E N */ 45, + /* 4F O */ 46, + /* 50 P */ 47, + /* 51 Q */ 48, + /* 52 R */ 49, + /* 53 S */ 50, + /* 54 T */ 51, + /* 55 U */ 52, + /* 56 V */ 53, + /* 57 W */ 54, + /* 58 X */ 55, + /* 59 Y */ 56, + /* 5A Z */ 57, + /* 5B [ */ 58, + /* 5C \ */ 59, + /* 5D ] */ 60, + /* 5E ^ */ 61, + /* 5F _ */ 62, + /* 60 ` */ 63, + /* 61 a */ 64, + /* 62 b */ 65, + /* 63 c */ 66, + /* 64 d */ 67, + /* 65 e */ 68, + /* 66 f */ 69, + /* 67 g */ 70, + /* 68 h */ 71, + /* 69 i */ 72, + /* 6A j */ 73, + /* 6B k */ 74, + /* 6C l */ 75, + /* 6D m */ 76, + /* 6E n */ 77, + /* 6F o */ 78, + /* 70 p */ 79, + /* 71 q */ 80, + /* 72 r */ 81, + /* 73 s */ 82, + /* 74 t */ 83, + /* 75 u */ 84, + /* 76 v */ BIN_ERROR, + /* 77 w */ BIN_ERROR, + /* 78 x */ BIN_ERROR, + /* 79 y */ BIN_ERROR, + /* 7A z */ BIN_ERROR, + /* 7B { */ BIN_ERROR, + /* 7C | */ BIN_ERROR, + /* 7D } */ BIN_ERROR, + /* 7E ~ */ BIN_ERROR, + /* 7F DEL */ BIN_ERROR +}; +#endif /*********************************************************************** ** @@ -307,6 +425,22 @@ }; +#ifdef INCLUDE_BASE85 +/*********************************************************************** +** +*/ static const REBYTE Enbase85[85] = +/* +** Base-85 binary encoder table. 
+** +***********************************************************************/ +{ + "!\"#$%&'()*+,-./0123456789:;<=>?@" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" + "abcdefghijklmnopqrstu" +}; +#endif + + /*********************************************************************** ** */ static REBSER *Decode_Base2(const REBYTE **src, REBCNT len, REBYTE delim) @@ -511,6 +645,89 @@ } +#ifdef INCLUDE_BASE85 +/*********************************************************************** +** +*/ static REBSER *Decode_Base85(const REBYTE **src, REBCNT len, REBYTE delim) +/* Decodes len characters of ASCII85 text starting at *src into a new binary series. Returns the series, or 0 after freeing it and storing the read position in *src when a character is invalid or a group overflows 32 bits. delim is not referenced in the visible body. +***********************************************************************/ +{ + REBYTE *bp; + const REBYTE *cp; + REBSER *ser; + REBCNT ser_size; + REBCNT chunk; + REBCNT pos, c; + REBINT pad=0; + + // Allocate buffer large enough to hold result: + ser = Make_Binary(((len + 4) / 5) * 4); + ser_size = SERIES_AVAIL(ser); + + bp = STR_HEAD(ser); + cp = *src; + + pos = 0; + while(len > 0) { + if(pos >= ser_size) { /* NOTE(review): pos advances in steps of 4, so this check is sufficient only if ser_size is a multiple of 4; the terminator stored at bp[pos] after the loop also needs one spare byte -- confirm */ + // in extreme cases (a lot of 'z' chars) initially computed size may not be enough + SERIES_TAIL(ser) = ser_size; // sets current series' tail (used by expand function below) + Expand_Series(ser, ser_size, 8); // may expand more than 8 bytes + bp = STR_HEAD(ser); // because above could reallocate + ser_size = SERIES_AVAIL(ser); + } + /* 'z' is a special way to encode 4 bytes of 0s */ + if(*cp=='z') { + cp++; + len--; + bp[pos++] = 0u; + bp[pos++] = 0u; + bp[pos++] = 0u; + bp[pos++] = 0u; + continue; + } + chunk = 0; + for(c=0; c 0) { /* NOTE(review): this line looks truncated in this copy of the patch (it does not parse, and `d` has no visible declaration) -- restore it from the upstream commit */ + len--; + d=Debase85[(REBYTE)*cp++]; /* NOTE(review): Debase85 has 128 entries; if REBYTE is an 8-bit unsigned type, a source byte >= 0x80 reads past the table -- confirm input is 7-bit or add a range check */ + if (d == BIN_SPACE) { + // ignore spaces + c--; continue; + } + if (d > 127) goto err; /* failure - invalid character */ + } + else { /* input ran out inside a group: chunk is bumped once and pad counts the missing digits, which are removed from the tail after the loop. NOTE(review): if only skipped characters follow a complete group, a new group seems to start with no digits and pad reaches 5, so the tail would come out one byte short -- verify with trailing whitespace */ + if(!pad) chunk++; + pad++; + } + if(c == 4) { + // math overflow checking.. 
for example input: {s8W-"} + if (chunk > (MAX_U32 / 85u)) goto err; + chunk *= 85; + if (chunk > (MAX_U32 - d)) goto err; + chunk += d; + } else chunk = chunk * 85 + d; + } + bp[pos ] = (REBYTE)(chunk >> 24); + bp[pos+1] = (REBYTE)(chunk >> 16); + bp[pos+2] = (REBYTE)(chunk >> 8); + bp[pos+3] = (REBYTE)(chunk ); + pos += 4; + } + bp[pos] = 0; + ser->tail = pos - pad; + return ser; + +err: + Free_Series(ser); + *src = cp; + return 0; +} +#endif + + /*********************************************************************** ** */ const REBYTE *Decode_Binary(REBVAL *value, const REBYTE *src, REBCNT len, REBINT base, REBYTE delim, REBOOL urlSafe) @@ -531,6 +748,13 @@ case 2: ser = Decode_Base2 (&src, len, delim); break; + case 85: +#ifdef INCLUDE_BASE85 + ser = Decode_Base85 (&src, len, delim); +#else + Trap0(RE_FEATURE_NA); +#endif + break; } if (!ser) return 0; @@ -674,3 +898,64 @@ return series; } + + +#ifdef INCLUDE_BASE85 +/*********************************************************************** +** +*/ REBSER *Encode_Base85(REBVAL *value, REBSER *series, REBFLG brk) +/* +** Base85 encode a given series. Must be BYTES, not UNICODE. 
+** +***********************************************************************/ +{ + REBCNT len; + REBYTE *bp; + REBYTE *src; + REBCNT x=0; + REBINT loop; + REBCNT i, chunk; + + len = VAL_LEN(value); + src = VAL_BIN_DATA(value); + + // Reserve 5 output characters for every 4 input bytes, rounded up (an all-zero group needs only the single 'z'; brk is not used): + series = Prep_String(series, &bp, ((len + 3) / 4) * 5); + // (Note: tail not properly set yet) + + //if (len >= 32 && brk) *bp++ = LF; + loop = (len / 4) - 1; + if(loop >= 0) { + for (x = 0; x <= 4 * loop;) { + chunk = ((REBCNT)src[x++]) << 24u; + chunk |= ((REBCNT)src[x++]) << 16u; + chunk |= ((REBCNT)src[x++]) << 8u; + chunk |= ((REBCNT)src[x++]) ; + if(chunk==0) { + *bp++='z'; /* this is a special zero character */ + } else { + for(i = BASE85_DIGITS;i--;) { + bp[i] = Enbase85[chunk%85]; + chunk /= 85; + } + bp += 5; + } + } + } + if ((len % 4) != 0) { + chunk = (((REBCNT)src[x++]) << 24u); + chunk |= ((x < (REBCNT)len) ? (((REBCNT)src[x++]) << 16u): 0u); + chunk |= ((x < (REBCNT)len) ? (((REBCNT)src[x++]) << 8u): 0u); + chunk |= ((x < (REBCNT)len) ? (((REBCNT)src[x++]) ): 0u); + for(i = BASE85_DIGITS;i--;) { + bp[i] = Enbase85[chunk%85]; + chunk /= 85; + } + bp += (len % 4) + 1; /* an n-byte tail keeps only the first n+1 of the 5 digits just written */ + } + *bp = 0; + SERIES_TAIL(series) = DIFF_PTRS(bp, series->data); + + return series; +} +#endif \ No newline at end of file diff --git a/src/core/n-strings.c b/src/core/n-strings.c index 3a21f3e28f..70e321ce60 100644 --- a/src/core/n-strings.c +++ b/src/core/n-strings.c @@ -497,6 +497,14 @@ static struct digest { break; case 2: ser = Encode_Base2(arg, 0, FALSE); + break; + case 85: +#ifdef INCLUDE_BASE85 + ser = Encode_Base85(arg, 0, FALSE); +#else + Trap0(RE_FEATURE_NA); +#endif + break; default: Trap_Arg(D_ARG(3)); diff --git a/src/tests/units/enbase-test.r3 b/src/tests/units/enbase-test.r3 index d4f1e7d3a9..bd451177e1 100644 --- a/src/tests/units/enbase-test.r3 +++ b/src/tests/units/enbase-test.r3 @@ -114,4 +114,69 @@ Rebol [ ===end-group=== +if any [ + not error? 
err: try [enbase/base "a" 85] + err/id <> 'feature-na +][ + base85-str-tests: [ + "" "" + "h" "BE" + "he" "BOq" + "hel" "BOtu" + "hell" "BOu!r" + "hello" "BOu!rDZ" + "hello " "BOu!rD]f" + "hello w" "BOu!rD]j6" + "hello wo" "BOu!rD]j7B" + "hello wor" "BOu!rD]j7BEW" + "hello worl" "BOu!rD]j7BEbk" + "hello world" "BOu!rD]j7BEbo7" + "hello world!" "BOu!rD]j7BEbo80" + ] + base85-bin-tests: [ + #{00} {!!} + #{0000} {!!!} + #{000000} {!!!!} + #{00000000} {z} + #{0000000000000000} {zz} + #{000000000000000000} {zz!!} + #{0100000000} {!<<*"!!} + #{ffd8ffe0} {s4IA0} + #{ffffffff} {s8W-!} + ] + base85-spaces-tests: [ + #{68} "B E" + #{68} "B^-E" + #{68} "B^/E" + #{68} "B^ME" + #{68656C6C6F} "BOu!rDZ " + #{68656C6C6F} "BOu!rDZ^-" + #{68656C6C6F} "BOu!rDZ^/" + #{68656C6C6F} "BOu!rDZ^M" + ] + + + ===start-group=== "enbase-85" + --test-- "enbase/base str 85" + foreach [inp out] base85-str-tests [ + --assert out = enbase/base inp 85 + ] + ===end-group=== + + ===start-group=== "debase-85" + --test-- "debase/base str 85" + foreach [out inp] base85-str-tests [ + --assert out = probe to-string debase/base inp 85 + ] + --test-- "debase 85 with spaces" + foreach [out inp] base85-spaces-tests [ + --assert out = debase/base inp 85 + ] + --test-- "invalid debase85 input" + --assert error? try [debase/base "abcx" 85] + --assert error? try [debase/base "~>" 85] + --assert error? try [debase/base {s8W-"} 85] + ===end-group=== +] + ~~~end-file~~~ \ No newline at end of file