diff --git a/.gitignore b/.gitignore index 6463dc6..efbff5e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ sf_build/u-ctenolophon-gawkextlib* aregex.so +aregex.dll diff --git a/Makefile b/Makefile index 21a2436..d643790 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,21 @@ PREFIX = /usr/local BASE = aregex +SHLIBEXT = .so -$(BASE).so: $(BASE).c - gcc -shared -lgawkextlib -ltre -Wall -Wextra -Wno-unused-parameter -Wmissing-prototypes -Wpointer-arith -Wcast-qual -Wwrite-strings -Wshadow -g -O2 -fPIC -o $(BASE).so $(BASE).c +$(BASE)$(SHLIBEXT): $(BASE).c + gcc $(CFLAGS) -shared -Wall -Wextra -Wno-unused-parameter -Wmissing-prototypes -Wpointer-arith -Wcast-qual -Wwrite-strings -Wshadow -g -O2 -fPIC -o $@ $< -lgawkextlib -ltre -check: $(BASE).so test/$(BASE).awk test/$(BASE).ok +check: $(BASE)$(SHLIBEXT) test/$(BASE).awk test/$(BASE).ok gawk -f test/$(BASE).awk > test/$(BASE).tmp bash -c "if [ -z `diff test/$(BASE).ok test/$(BASE).tmp` ] ; then echo '** PASS **'; else echo '** FAIL **' ; fi " rm -f test/$(BASE).tmp -install: $(BASE).so doc/$(BASE).3am +install: $(BASE)$(SHLIBEXT) doc/$(BASE).3am mkdir -p $(PREFIX)/lib/gawk - cp -f $(BASE).so $(PREFIX)/lib/gawk/. + cp -f $(BASE)$(SHLIBEXT) $(PREFIX)/lib/gawk/. mkdir -p $(PREFIX)/share/man/man3 cp -f doc/$(BASE).3am $(PREFIX)/share/man/man3/. 
-man: doc/$(BASE).md - pandoc -s -t man -o doc/$(BASE).3am doc/$(BASE).md +man : doc/$(BASE).3am +doc/$(BASE).3am: doc/$(BASE).md + which pandoc >/dev/null 2>&1 && pandoc -s -t man -o $@ $< || touch $@ diff --git a/aregex.c b/aregex.c index c598c10..9762c09 100644 --- a/aregex.c +++ b/aregex.c @@ -9,25 +9,84 @@ #include #include #include -#include +#include #include #define MAXNSUBMATCH 20 // Max Number of parenthetical substring matches -#define DEFMAXCOST 5 // Default max_cost for match +#define DEFMAXCOST 5 // Default max_cost for match #define DEBUG 0 // Print debug info +#define PACKAGE_STRING "aregex 1.0.1" + // Gawkextlib boilerplate: static const gawk_api_t *api; static awk_ext_id_t ext_id; int plugin_is_GPL_compatible; +/* regex hash table */ +static strhash *ht_regex; +#ifdef AREGEX_MEM_DEBUG +static int ht_regex_n_alloced = 0; +#endif + +/* hash element destructor: strhash_destroy callback that tre_regfree()s and frees the regex_t cached in he->data */ +static void +he_data_destroy (void *data, void *opaque, strhash *ht, strhash_entry *he) +{ + if (he && he->data) { + tre_regfree (he->data); + gawk_free (he->data); + #ifdef AREGEX_MEM_DEBUG + ht_regex_n_alloced -= sizeof(regex_t); + #endif + he->data = NULL; + } +} + +/* look up regex in cache and create it if not found; returns the compiled regex_t, or NULL (with ERRNO set) if allocation or compilation fails */ +static regex_t * +tre_regex_lookup (const char* pattern, size_t pattern_len) +{ + strhash_entry * he; + + he = strhash_get (ht_regex, pattern, pattern_len, 0); + if (!he) { + regex_t * rx; + size_t sz; + int rc; + int flags; + flags = REG_EXTENDED; + sz = sizeof (regex_t); + if ((rx = gawk_calloc (1, sz)) == NULL) { update_ERRNO_string("aregex: out of memory"); return NULL; } /* allocation can fail: never hand a NULL regex_t to tre_regncomp */ + #ifdef AREGEX_MEM_DEBUG + ht_regex_n_alloced += sz; + #endif + rc = tre_regncomp (rx, pattern, pattern_len, flags); + if ( rc == REG_OK) { + he = strhash_get (ht_regex, pattern, pattern_len, 1); /* he ensured not to be NULL */ + he->data = rx; + } else { + /* regexp compilation failed: publish TRE's message via ERRNO; rx is freed below and nothing is cached */ + char tre_err_buf[128], err_buf[256]; + tre_regerror(rc, rx, tre_err_buf, sizeof tre_err_buf); + snprintf(err_buf, sizeof err_buf, "aregex: tre: %s in /%s/", tre_err_buf, pattern); + 
update_ERRNO_string(err_buf); + gawk_free (rx); + #ifdef AREGEX_MEM_DEBUG + ht_regex_n_alloced -= sz; + #endif + } + } + return he ? he->data : NULL; +} + // Main amatch() function definition static awk_value_t * do_amatch(int nargs, awk_value_t *result \ , struct awk_ext_func *unused) { - int i; - + int i; + // 1. Set default costs const char *parami[8]; int paramv[8]; @@ -39,7 +98,7 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ parami[5] = "max_ins"; paramv[5] = DEFMAXCOST; parami[6] = "max_subst"; paramv[6] = DEFMAXCOST; parami[7] = "max_err"; paramv[7] = DEFMAXCOST; - + // 2. Read 3rd, 'costs' argument, if present // (these variable declarations outside, because needed during output: ) awk_value_t costs; @@ -47,7 +106,7 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ awk_value_t costindex; awk_value_t costval; awk_bool_t hascostarr = 0; - + if (nargs > 2) { // if just a simple integer for 3rd argument: if (get_argument(2, AWK_NUMBER, &simplecost)) { @@ -59,8 +118,7 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ } else if (get_argument(2, AWK_ARRAY, &costs)) { hascostarr = 1; - - char c[30]; + for (i = 0; i < 8; i++) { // create an index for reading array make_const_string(parami[i], strlen(parami[i]), &costindex); @@ -70,9 +128,7 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ // update the cost value paramv[i] = atoi(costval.str_value.str); if (DEBUG) { - strcpy(c,"") ; - sprintf(c, "cost %s = %d", parami[i], atoi(costval.str_value.str)); - warning(ext_id, c); + warning(ext_id, "cost %s = %d", parami[i], atoi(costval.str_value.str)); } } } @@ -91,10 +147,11 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ // ( for wchar_t: // wchar_t rew[] = L""; // swprintf(rew, strlen(re.str_value.str), L"%ls", re.str_value.str); ) - + // 4. 
Compile regex - regex_t preg; - tre_regcomp(&preg, re.str_value.str, REG_EXTENDED); + regex_t *preg; + preg = tre_regex_lookup(re.str_value.str, re.str_value.len); + if(!preg) return make_number(-1, result); // ( for wchar_t: // tre_regwcomp(&preg, rew, REG_EXTENDED); ) @@ -102,7 +159,7 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ // 5. Do the match // set approx match params regaparams_t params = { 0 }; - params.cost_ins = paramv[0]; + params.cost_ins = paramv[0]; params.cost_del = paramv[1]; params.cost_subst = paramv[2]; params.max_cost = paramv[3]; @@ -113,16 +170,17 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ // create necessary structure for details of match regamatch_t match ; - match.nmatch = MAXNSUBMATCH; - match.pmatch = (regmatch_t *) malloc(MAXNSUBMATCH * sizeof(regmatch_t)); - + regmatch_t pmatch[MAXNSUBMATCH]; + match.nmatch = MAXNSUBMATCH; + match.pmatch = &pmatch[0]; + // do the approx regexp itself! int treret; - treret = tre_regaexec(&preg, str.str_value.str, &match, params, 0); + treret = tre_regaexec(preg, str.str_value.str, &match, params, 0); // ( for wchar_t: // treret = tre_regawexec(&pregw, rew, &match, params, 0); ) - + // set the amatch() return value depending on tre_regaexec() return // 1 if success, 0 if no match int rval = 1; @@ -132,48 +190,52 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ if (treret == REG_ESPACE) { warning(ext_id, \ "amatch: TRE err., mem. insufficient to complete the match."); - free(match.pmatch); - tre_regfree(&preg); return make_null_string(result); } // 6. 
If there is a cost array, set some return values (if a match) if ((hascostarr) && (rval)) { + int n; char matchcost[20]; // Single integers, max width ~= 10 + #define COST_LEN 4 + #define NUM_INS_LEN 7 + #define NUM_DEL_LEN 7 + #define NUM_SUBST_LEN 9 // cost del_array_element(costs.array_cookie, \ - make_const_string("cost", strlen("cost"), &costindex)); - sprintf(matchcost, "%d", match.cost); + make_const_string("cost", COST_LEN, &costindex)); + n = sprintf(matchcost, "%d", match.cost); set_array_element(costs.array_cookie, \ - make_const_string("cost", strlen("cost"), &costindex), \ - make_const_string(matchcost, strlen(matchcost), &costval)); + make_const_string("cost", COST_LEN, &costindex), \ + make_const_string(matchcost, n, &costval)); // num_ins del_array_element(costs.array_cookie, \ - make_const_string("num_ins", strlen("num_ins"), &costindex)); - sprintf(matchcost, "%d", match.num_ins); + make_const_string("num_ins", NUM_INS_LEN, &costindex)); + n = sprintf(matchcost, "%d", match.num_ins); set_array_element(costs.array_cookie, \ - make_const_string("num_ins", strlen("num_ins"), &costindex), \ - make_const_string(matchcost, strlen(matchcost), &costval)); + make_const_string("num_ins", NUM_INS_LEN, &costindex), \ + make_const_string(matchcost, n, &costval)); // num_del del_array_element(costs.array_cookie, \ - make_const_string("num_del", strlen("num_del"), &costindex)); - sprintf(matchcost, "%d", match.num_del); + make_const_string("num_del", NUM_DEL_LEN, &costindex)); + n = sprintf(matchcost, "%d", match.num_del); set_array_element(costs.array_cookie, \ - make_const_string("num_del", strlen("num_del"), &costindex), \ - make_const_string(matchcost, strlen(matchcost), &costval)); + make_const_string("num_del", NUM_DEL_LEN, &costindex), \ + make_const_string(matchcost, n, &costval)); // num_subst del_array_element(costs.array_cookie, \ - make_const_string("num_subst", strlen("num_subst"), &costindex)); - sprintf(matchcost, "%d", match.num_subst); + 
make_const_string("num_subst", NUM_SUBST_LEN, &costindex)); + n = sprintf(matchcost, "%d", match.num_subst); set_array_element(costs.array_cookie, \ - make_const_string("num_subst", strlen("num_subst"), &costindex), \ - make_const_string(matchcost, strlen(matchcost), &costval)); + make_const_string("num_subst", NUM_SUBST_LEN, &costindex), \ + make_const_string(matchcost, n, &costval)); } - + // 7. Set 4th argument array, for matched substrings, if present // and if a match found if ((nargs == 4) && (rval)) { - awk_value_t substr; + int n,m; + awk_value_t substr; // read 4th argument if (!get_argument(3, AWK_ARRAY, &substr)) { warning(ext_id, "amatch: Could not read 4th argument."); @@ -195,21 +257,19 @@ static awk_value_t * do_amatch(int nargs, awk_value_t *result \ for (i = 0 ; i < (int) match.nmatch; i++) { if (match.pmatch[i].rm_so != -1) { - sprintf(outindexc, "%d", i); + n = sprintf(outindexc, "%d", i); // ( "%d %.*s", match.pmatch[i].rm_so+1, ... gives position // by bytes, not by chars ) - sprintf(outvalc, "%.*s", \ + m = sprintf(outvalc, "%.*s", \ match.pmatch[i].rm_eo - match.pmatch[i].rm_so, \ str.str_value.str + match.pmatch[i].rm_so); - set_array_element(substr.array_cookie, - make_const_string(outindexc, strlen(outindexc), &outindexp), \ - make_const_string(outvalc, strlen(outvalc), &outvalp)); + set_array_element(substr.array_cookie, + make_const_string(outindexc, n, &outindexp), \ + make_const_string(outvalc, m, &outvalp)); } } } - - free(match.pmatch); - tre_regfree(&preg); + return make_number(rval, result); } @@ -221,8 +281,32 @@ static awk_ext_func_t func_table[] = \ { "amatch", do_amatch, 4, 2, awk_false, NULL }, }; -static awk_bool_t (*init_func)(void) = NULL; +/* procedure run on exiting gawk and the extension: destroys the regex hash table, passing he_data_destroy as the per-entry destructor; under AREGEX_MEM_DEBUG it warns if the allocation counter is non-zero */ +static void +aregex_awk_atexit (void* data, int exit_status) +{ + strhash_destroy (ht_regex, he_data_destroy, NULL); + #ifdef AREGEX_MEM_DEBUG + if(ht_regex_n_alloced) + warning(ext_id,"aregex: memory leakage: %d bytes", 
ht_regex_n_alloced); + #endif +} + +/* initialize extension: ext_version is filled by set_ext_version() with PACKAGE_STRING followed by the tre_version() string in parentheses */ +static char ext_version[512]; +static void set_ext_version(void) { + snprintf(ext_version, sizeof ext_version, "%s (%s)", PACKAGE_STRING, tre_version()); +} + +static awk_bool_t +aregex_init_func (void) +{ + ht_regex = strhash_create (0); /* create the regex cache used by tre_regex_lookup */ + awk_atexit (aregex_awk_atexit, NULL); /* register the cache cleanup to run when gawk exits */ + set_ext_version(); + return awk_true; +} -static const char *ext_version = "0.1"; +static awk_bool_t (*init_func)(void) = aregex_init_func; dl_load_func(func_table, amatch, "") diff --git a/doc/aregex.3am b/doc/aregex.3am index 0f6c023..0f71f35 100644 --- a/doc/aregex.3am +++ b/doc/aregex.3am @@ -1,13 +1,13 @@ -.\" Automatically generated by Pandoc 2.3.1 +.\" Automatically generated by Pandoc 2.10 .\" .TH "AREGEX" "3am" "Nov 24 2018" "Free Software Foundation" "GNU Awk Extension Modules" .hy .SH NAME .PP -aregex \- approximate (fuzzy) string matching with regular expressions +aregex - approximate (fuzzy) string matching with regular expressions .SH SYNOPSIS .PP -\@load \f[C]"aregex"\f[] +\[at]load \f[C]\[dq]aregex\[dq]\f[R] .PP success = amatch(str, regex [, cost|costs [, submatches]]) .SH DESCRIPTION @@ -21,150 +21,153 @@ For example, .IP .nf \f[C] -"abcdef" -"abcxdef"\ \ #\ one\ insertion -"abdef"\ \ \ \ #\ one\ deletion -"abxdef"\ \ \ #\ one\ substitution -\f[] +\[dq]abcdef\[dq] +\[dq]abcxdef\[dq] # one insertion +\[dq]abdef\[dq] # one deletion +\[dq]abxdef\[dq] # one substitution +\f[R] .fi .PP The cost of the match (the Levenshtein distance between strings) can be reported. This Gawk extension provides an interface with the -\f[I]tre_regaexec()\f[] function in the TRE library, permitting the +\f[I]tre_regaexec()\f[R] function in the TRE library, permitting the setting of all possible parameters for that function, and returning all possible information about a match. 
.SS Function summary .PP -A single function, \f[B]amatch()\f[] is provided, modeled on the Gawk -\f[I]match()\f[] function: +A single function, \f[B]amatch()\f[R] is provided, modeled on the Gawk +\f[I]match()\f[R] function: .PP -\f[B]amatch(\f[] \f[I]str\f[] \f[B],\f[] \f[I]regex\f[] \f[B][,\f[] -\f[I]cost\f[]|\f[I]costs\f[] \f[B][,\f[] \f[I]submatches\f[] \f[B]]\f[] -\f[B]]\f[] \f[B])\f[] +\f[B]amatch(\f[R] \f[I]str\f[R] \f[B],\f[R] \f[I]regex\f[R] \f[B][,\f[R] +\f[I]cost\f[R]|\f[I]costs\f[R] \f[B][,\f[R] \f[I]submatches\f[R] +\f[B]]\f[R] \f[B]]\f[R] \f[B])\f[R] .PP This function takes two mandatory string arguments, and two optional arguments. -\f[I]regex\f[] is an \f[B]extended\f[] regular expression (or plain -string) to be matched against string \f[I]str\f[]. -Note that the regular expression \f[I]regex\f[] is bounded by -double\-quotes, not by the usual Gawk slashes. +\f[I]regex\f[R] is an \f[B]extended\f[R] regular expression (or plain +string) to be matched against string \f[I]str\f[R]. +Note that the regular expression \f[I]regex\f[R] is bounded by +double-quotes, not by the usual Gawk slashes. .SS Setting approximate match costs .PP With only two arguments, the default maximum cost for the approximate -match is set to 5, and other costs are set as below. -The maximum cost may also be set by the user via an optional third -parameter: either an integer (\f[I]cost\f[]), or a member of a -one\-dimensional array (\f[I]costs\f[]) indexed by \f[C]"max_cost"\f[]. +match is set to 5 (and other costs are set as below). +The maximum cost may also be set by the user via an optional \f[B]third +argument\f[R]: either an integer (\f[I]cost\f[R]), or a member of a +one-dimensional array (\f[I]costs\f[R]) indexed by +\f[C]\[dq]max_cost\[dq]\f[R]. Setting maximum cost to 0 forces an exact regular expression match, as -with Gawk's \f[I]match()\f[]. 
-Other members of the \f[I]costs\f[] array with appropriate index values -will set the parameters of the \f[I]regaparams_t\f[] structure used by -\f[I]tre_regaexec()\f[]: +with Gawk\[cq]s \f[I]match()\f[R]. +Other members of the \f[I]costs\f[R] array with appropriate index values +will set the parameters of the \f[I]regaparams_t\f[R] structure used by +\f[I]tre_regaexec()\f[R]: .IP .nf \f[C] -Array\ index\ \ \ Parameter\ \ \ \ Def\ val\ \ Meaning -============\ \ ===========\ \ =======\ \ =====================\ \ \ \ \ -"cost_ins"\ \ \ \ .cost_ins\ \ \ \ \ \ \ 1\ \ \ \ \ Cost\ of\ one\ insertion -"cost_del"\ \ \ \ .cost_del\ \ \ \ \ \ \ 1\ \ \ \ \ Cost\ of\ one\ deletion -"cost_subst"\ \ .cost_subst\ \ \ \ \ 1\ \ \ \ \ Cost\ of\ one\ substitution -"max_cost"\ \ \ \ .max_cost\ \ \ \ \ \ \ 5\ \ \ \ \ Max.\ cost -"max_del"\ \ \ \ \ .max_del\ \ \ \ \ \ \ \ 5\ \ \ \ \ Max.\ number\ of\ deletions -"max_ins"\ \ \ \ \ .max_ins\ \ \ \ \ \ \ \ 5\ \ \ \ \ Max.\ number\ of\ insertions -"max_subst"\ \ \ .max_subst\ \ \ \ \ \ 5\ \ \ \ \ Max.\ number\ of\ substitutions -"max_err"\ \ \ \ \ .max_err\ \ \ \ \ \ \ \ 5\ \ \ \ \ Max.\ number\ of\ ins+del+subst -\f[] +Array index Parameter Def val Meaning +============ =========== ======= ===================== +\[dq]cost_ins\[dq] .cost_ins 1 Cost of one insertion +\[dq]cost_del\[dq] .cost_del 1 Cost of one deletion +\[dq]cost_subst\[dq] .cost_subst 1 Cost of one substitution +\[dq]max_cost\[dq] .max_cost 5 Max. cost +\[dq]max_del\[dq] .max_del 5 Max. number of deletions +\[dq]max_ins\[dq] .max_ins 5 Max. number of insertions +\[dq]max_subst\[dq] .max_subst 5 Max. number of substitutions +\[dq]max_err\[dq] .max_err 5 Max. number of ins+del+subst +\f[R] .fi .PP -If the array \f[I]costs\f[] is provided but contains none of the above +If the array \f[I]costs\f[R] is provided but contains none of the above indexes, the default values are used. 
.SS Return value .PP -The \f[B]amatch()\f[] function returns 1 on a successful match and 0 on -a failure to match. +The \f[B]amatch()\f[R] function returns 1 on a successful match, 0 on a +failure to match and -1 if \f[I]regex\f[R] is invalid (with TRE\[cq]s +error message in \f[I]ERRNO\f[R]). .SS Obtaining match summary data .PP -If a third array argument is provided to \f[B]amatch()\f[], and a match -was successful, information about the match is return via (clearing and) -filling members of the \f[I]costs\f[] array with these indexes: +If a \f[B]third array argument\f[R] is provided to \f[B]amatch()\f[R], +and a match was successful, information about the match is returned via +(clearing and) filling members of the \f[I]costs\f[R] array with these +indexes: .IP .nf \f[C] -Array\ index\ \ \ Meaning -============\ \ ============================================== -"cost"\ \ \ \ \ \ \ \ Total\ cost\ of\ the\ match\ (Levenshtein\ distance)\ \ -"num_ins"\ \ \ \ \ Total\ number\ of\ insertions -"num_del"\ \ \ \ \ Total\ number\ of\ deletions -"num_subst"\ \ \ Total\ number\ of\ substitutions -\f[] +Array index Meaning +============ ============================================== +\[dq]cost\[dq] Total cost of the match (Levenshtein distance) +\[dq]num_ins\[dq] Total number of insertions +\[dq]num_del\[dq] Total number of deletions +\[dq]num_subst\[dq] Total number of substitutions +\f[R] .fi .SS Obtaining parenthetical submatches .PP If an array (or empty Gawk variable symbol) is provided as the -\f[B]fourth argument\f[] , and a match is successful, the array will be +\f[B]fourth argument\f[R], and a match is successful, the array will be cleared and filled with submatches corresponding to the parenthetical -sub\-expression in \f[I]regex\f[], with indexes \f[I]1\&...n\f[], up to +sub-expression in \f[I]regex\f[R], with indexes \f[I]1\&...n\f[R], up to a maximum of 20. 
-The array member indexed by \f[I]0\f[] will be the portion of -\f[I]str\f[] matched by the whole of \f[I]re\f[]. -.PP -\f[B]A note on bytes and characters\f[]: While the \f[B]amatch()\f[] -function is roughly equivalent to the Gawk \f[I]match()\f[] function, -submatches are not returned as in \f[I]match()\f[] via -\f[I][i,\[lq]start\[rq]]\f[] position and \f[I][i,\[lq]length\[rq]]\f[] -(see Gawk \f[C]man\f[] page). +The array member indexed by \f[I]0\f[R] will be the portion of +\f[I]str\f[R] matched by the whole of \f[I]regex\f[R]. +.PP +\f[B]A note on bytes and characters\f[R]: While the \f[B]amatch()\f[R] +function is roughly equivalent to the Gawk \f[I]match()\f[R] function, +submatches are not returned as in \f[I]match()\f[R], e.g.\ via +\f[I][i,\[lq]start\[rq]]\f[R] position and +\f[I][i,\[lq]length\[rq]]\f[R] (see Gawk \f[C]man\f[R] page). Instead only the literal substring for each parenthetical match is given. -Gawk is multibyte aware, and \f[I]match()\f[] works in terms of -characters, not bytes, but TRE is byte\-based, not character\-based. -Using the \f[I]wchar_t\f[] versions of TRE functions cannot help if the -input is a mix of single and multi\-byte characters. +Gawk is multibyte aware, and \f[I]match()\f[R] works in terms of +characters, not bytes, but TRE is byte-based, not character-based. +Using the \f[I]wchar_t\f[R] versions of TRE functions cannot help if the +input is a mix of single and multi-byte characters. A simple Gawk routine must be used on the output array -(\f[I]submatches\f[]), if positions and lengths of the substrings are +(\f[I]submatches\f[R]), if positions and lengths of the substrings are needed. 
E.g.: .IP .nf \f[C] -print\ "i",\ "substring",\ "posn",\ "length" -p\ =\ 1 -for\ (i\ =\ 1;\ i\ <\ length(submatches);\ i++)\ { -\ \ idx\ =\ index(substr(str,\ p),\ submatches[i]) -\ \ len\ =\ length(out[i]) -\ \ print\ i,\ submatches[i],\ idx+p\-1,\ len -\ \ p\ =\ p\ +\ idx\ +\ len +print \[dq]i\[dq], \[dq]substring\[dq], \[dq]posn\[dq], \[dq]length\[dq] +p = 1 +for (i = 1; i < length(submatches); i++) { + idx = index(substr(str, p), submatches[i]) + len = length(submatches[i]) + print i, submatches[i], idx+p-1, len + p = p + idx + len - 1 } -\f[] +\f[R] .fi .SH EXAMPLE .IP .nf \f[C] -\@load\ "aregex" -BEGIN\ { -\ \ str\ =\ "abcdễfbc" -\ \ regex\ =\ "^a(bc)d(ễ)(f)$" -\ \ costs["max_cost"]\ =\ 6 -\ \ costs["cost_ins"]\ =\ 2 -\ \ if\ (amatch(str,\ regex,\ costs,\ submatches)) -\ \ \ \ print\ costs["cost"],\ submatches[1] -}\ \ \ \ -\f[] +\[at]load \[dq]aregex\[dq] +BEGIN { + str = \[dq]abcd\[u1EC5]fbc\[dq] + regex = \[dq]\[ha]a(bc)d(\[u1EC5])(f)$\[dq] + costs[\[dq]max_cost\[dq]] = 6 + costs[\[dq]cost_ins\[dq]] = 2 + if (amatch(str, regex, costs, submatches)>0) + print costs[\[dq]cost\[dq]], submatches[1] +} +\f[R] .fi .SH SEE ALSO .PP -The Gawk extension lib: https://sourceforge.net/projects/gawkextlib/ TRE -library: https://laurikari.net/tre/ +The Gawk extension lib: https://sourceforge.net/projects/gawkextlib/ and +TRE library: https://laurikari.net/tre/ .SH AUTHORS .PP -Cam Webb , \@laurikari for the TRE library, the -\f[I]gawkextlib\f[] authors +Cam Webb , \[at]laurikari for the TRE library, the +\f[I]gawkextlib\f[R] authors .SH COPYING PERMISSIONS .PP -Copyright © 2018, the Free Software Foundation, Inc. +Copyright \[co] 2018, the Free Software Foundation, Inc. .PP -Copyright © 2018, Campbell O. +Copyright \[co] 2018, Campbell O. Webb .PP Permission is granted to make and distribute verbatim copies of this @@ -178,5 +181,5 @@ permission notice identical to this one. 
.PP Permission is granted to copy and distribute translations of this manual page into another language, under the above conditions for modified -versions, except that this permission notice may be stated in a trans‐ -lation approved by the Foundation. +versions, except that this permission notice may be stated in a +translation approved by the Foundation. diff --git a/doc/aregex.md b/doc/aregex.md index fa847ba..cad520c 100644 --- a/doc/aregex.md +++ b/doc/aregex.md @@ -1,5 +1,5 @@ % AREGEX(3am) Free Software Foundation | GNU Awk Extension Modules -% +% % Nov 24 2018 # NAME @@ -18,7 +18,7 @@ The TRE library (ref. below) provides approximate matching regex capabilities. A match between two strings that differ in some number of characters will be found when the cost of character insertions, deletions and substitutions does not exceed some specified maximum -cost. For example, +cost. For example, ``` "abcdef" @@ -31,7 +31,7 @@ The cost of the match (the Levenshtein distance between strings) can be reported. This Gawk extension provides an interface with the _tre\_regaexec()_ function in the TRE library, permitting the setting of all possible parameters for that function, and returning all -possible information about a match. +possible information about a match. ## Function summary @@ -59,7 +59,7 @@ of the _regaparams\_t_ structure used by _tre\_regaexec()_: ``` Array index Parameter Def val Meaning -============ =========== ======= ===================== +============ =========== ======= ===================== "cost_ins" .cost_ins 1 Cost of one insertion "cost_del" .cost_del 1 Cost of one deletion "cost_subst" .cost_subst 1 Cost of one substitution @@ -76,9 +76,10 @@ indexes, the default values are used. ## Return value The **amatch()** function returns 1 on a successful -match and 0 on a failure to match. +match, 0 on a failure to match and -1 if _regex_ is invalid +(with TRE's error message in _ERRNO_). 
-## Obtaining match summary data +## Obtaining match summary data If a **third array argument** is provided to **amatch()**, and a match was successful, information about the match is returned via (clearing and) @@ -87,7 +88,7 @@ filling members of the _costs_ array with these indexes: ``` Array index Meaning ============ ============================================== -"cost" Total cost of the match (Levenshtein distance) +"cost" Total cost of the match (Levenshtein distance) "num_ins" Total number of insertions "num_del" Total number of deletions "num_subst" Total number of substitutions @@ -133,9 +134,9 @@ BEGIN { regex = "^a(bc)d(ễ)(f)$" costs["max_cost"] = 6 costs["cost_ins"] = 2 - if (amatch(str, regex, costs, submatches)) + if (amatch(str, regex, costs, submatches)>0) print costs["cost"], submatches[1] -} +} ``` # SEE ALSO diff --git a/sf_build/README.md b/sf_build/README.md index 21794dc..52b3a80 100644 --- a/sf_build/README.md +++ b/sf_build/README.md @@ -4,8 +4,6 @@ The Github version remains the master, but all changes can be pushed to SF using this scripts. The cloned SF directory tree will be deleted, but the history of changes lives at SF. -The `aregex.c.patch` patch should find all the places in `aregex.c` -where change is needed, even if the Github version has been updated. 
## Files than need changing @@ -17,7 +15,7 @@ From the file list in the message created by the Makefile.am configure.ac doc/aregex.3am - packaging/gawk-aregex.spec.in + packaging/gawk-aregex.spec.in test/Makefile.am The appropriate changes are made in the script (`test/Makefile.am` diff --git a/sf_build/aregex.c.patch b/sf_build/aregex.c.patch deleted file mode 100644 index 036be8e..0000000 --- a/sf_build/aregex.c.patch +++ /dev/null @@ -1,18 +0,0 @@ -13a14 -> #include "common.h" -19,23d19 -< // Gawkextlib boilerplate: -< static const gawk_api_t *api; -< static awk_ext_id_t ext_id; -< int plugin_is_GPL_compatible; -< -27c23 -< , struct awk_ext_func *unused) ---- -> API_FINFO_ARG) -214d209 -< -224c219 -< static const char *ext_version = "0.1"; ---- -> static const char *ext_version = PACKAGE_STRING; diff --git a/sf_build/make_sf_version.sh b/sf_build/make_sf_version.sh index 2ea3d64..024e723 100644 --- a/sf_build/make_sf_version.sh +++ b/sf_build/make_sf_version.sh @@ -24,8 +24,10 @@ sed -i 's/AC_GAWK_EXTENSION/AC_GAWK_EXTENSION\n\nAC_CHECK_LIB([tre], [tre_regaex sed -i 's/BuildRequires: gcc/BuildRequires: gcc\nBuildRequires: tre-devel/g' packaging/gawk-aregex.spec.in # Main code -cp -f ../../../aregex.c . 
-patch -i ../../aregex.c.patch aregex.c +sed -e '/^#define PACKAGE_STRING / , /int plugin_is_GPL_compatible/c\ +#include "common.h"' \ + -e 's|, struct awk_ext_func \*unused|API_FINFO_ARG|' \ + ../../../aregex.c > aregex.c # Man page cp -f ../../../doc/aregex.3am doc @@ -36,7 +38,7 @@ git add webTOC # Test files cp -f ../../../test/aregex.awk test -sed -i 's|./aregex.so|../.libs/aregex.so|g' test/aregex.awk +sed -i 's|./aregex|../.libs/aregex|g' test/aregex.awk # note: the gawkextlib test suite uses gawk with --characters-as-bytes cp -f ../../../test/aregex_b.ok test/aregex.ok @@ -65,7 +67,7 @@ make check # git clone git://git.code.sf.net/u/ctenolophon/gawkextlib u-ctenolophon-gawkextlib # cd u-ctenolophon-gawkextlib/aregex/ # autoreconf -i -# ./configure +# ./configure # make # make check # cd ../.. diff --git a/sf_build/update_sf_version.sh b/sf_build/update_sf_version.sh index 4361ef2..aa311bc 100644 --- a/sf_build/update_sf_version.sh +++ b/sf_build/update_sf_version.sh @@ -4,7 +4,7 @@ # Define new version here: -VERSION=1.0.1 +VERSION=1.1.0 # Get new version of original repo rm -rf u-ctenolophon-gawkextlib @@ -24,8 +24,10 @@ cd aregex/ sed -E -i "s/\ [0-9]+\.[0-9]+\.[0-9]+/ $VERSION/g" configure.ac # Change code files -cp -f ../../../aregex.c . -patch -i ../../aregex.c.patch aregex.c +sed -e '/^#define PACKAGE_STRING / , /int plugin_is_GPL_compatible/c\ +#include "common.h"' \ + -e 's|, struct awk_ext_func \*unused|API_FINFO_ARG|' \ + ../../../aregex.c > aregex.c # Man page cp -f ../../../doc/aregex.3am doc @@ -35,7 +37,7 @@ cp -f ../../webTOC . 
# Test files cp -f ../../../test/aregex.awk test -sed -i 's|./aregex.so|../.libs/aregex.so|g' test/aregex.awk +sed -i 's|./aregex|../.libs/aregex|g' test/aregex.awk # note: the gawkextlib test suite uses gawk with --characters-as-bytes cp -f ../../../test/aregex_b.ok test/aregex.ok @@ -51,7 +53,7 @@ cp -f ../../SF_README.md README ## Build autoreconf -i -./configure +./configure make make check @@ -67,7 +69,7 @@ make check # git clone git://git.code.sf.net/u/ctenolophon/gawkextlib u-ctenolophon-gawkextlib # cd u-ctenolophon-gawkextlib/aregex/ # autoreconf -i -# ./configure +# ./configure # make # make check # cd ../.. diff --git a/test/aregex.awk b/test/aregex.awk index aa58989..38cf56f 100644 --- a/test/aregex.awk +++ b/test/aregex.awk @@ -1,4 +1,4 @@ -@load "./aregex.so" +@load "./aregex" BEGIN { str = "abcdễfbc" @@ -14,7 +14,7 @@ BEGIN { print "Match sr : " amatch(str, re) print "Match src : " amatch(str, re, 6.1) - print "Match srcb: " amatch(str, re, 6, b), b[1] + print "Match srcb: " amatch(str, re, 6, b), b[1] print "Match srC : " amatch(str, re, cost) print "Match srCb: " amatch(str, re, cost, out) @@ -34,4 +34,7 @@ BEGIN { print "num_ins : " cost["num_ins"] print "num_del : " cost["num_del"] print "num_subst : " cost["num_subst"] + + # check that amatch can handle invalid regex + if(-1 == amatch(str,"^(")) print ERRNO > "/dev/stderr" } diff --git a/test/aregex_b.ok b/test/aregex_b.ok index 72a94bf..adeec9d 100644 --- a/test/aregex_b.ok +++ b/test/aregex_b.ok @@ -1,3 +1,4 @@ +aregex: tre: Missing ')' in /^(/ String : abcdễfbc RE : ^a(bc)d(ễ)(f)$ max_cost : 6