diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index ba2486c10c..f2000f6ffa 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -1772,6 +1772,25 @@ sections: input: '["fo", "foo", "barfoo", "foobar", "foob"]' output: ['["fo","","bar","foobar","foob"]'] + - title: "`trim`, `ltrim`, `rtrim`" + body: | + + `trim` trims both leading and trailing whitespace. + + `ltrim` trims only leading (left side) whitespace. + + `rtrim` trims only trailing (right side) whitespace. + + Whitespace characters are the usual `" "`, `"\n"`, `"\t"`, `"\r"` + and also all characters in the Unicode character database with the + whitespace property. Note that what is considered whitespace might + change in the future. + + examples: + - program: 'trim, ltrim, rtrim' + input: '" abc "' + output: ['"abc"', '"abc "', '" abc"'] + - title: "`explode`" body: | diff --git a/jq.1.prebuilt b/jq.1.prebuilt index 32221515d8..efa5aa2f34 100644 --- a/jq.1.prebuilt +++ b/jq.1.prebuilt @@ -1930,6 +1930,30 @@ jq \'[\.[]|rtrimstr("foo")]\' . .IP "" 0 . +.SS "trim, ltrim, rtrim" +\fBtrim\fR trims both leading and trailing whitespace\. +. +.P +\fBltrim\fR trims only leading (left side) whitespace\. +. +.P +\fBrtrim\fR trims only trailing (right side) whitespace\. +. +.P +Whitespace characters are the usual \fB" "\fR, \fB"\en"\fR, \fB"\et"\fR, \fB"\er"\fR and also all characters in the Unicode character database with the whitespace property\. Note that what is considered whitespace might change in the future\. +. +.IP "" 4 +. +.nf + +jq \'trim, ltrim, rtrim\' + " abc " +=> "abc", "abc ", " abc" +. +.fi +. +.IP "" 0 +. .SS "explode" Converts an input string into an array of the string\'s codepoint numbers\. . 
diff --git a/src/builtin.c b/src/builtin.c index 5f24cfb858..e93ac321f5 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) { return jv_string_indexes(a, b); } +enum trim_op { /* bit flags selecting which end(s) of the string string_trim() strips */ + TRIM_LEFT = 1 << 0, + TRIM_RIGHT = 1 << 1 +}; + +static jv string_trim(jv a, int op) { /* Strips whitespace codepoints (as judged by jvp_codepoint_is_whitespace) from the left and/or right end of string a, selected by the trim_op bits in op. Returns a itself when nothing is trimmed; otherwise returns a new string and frees a. Non-string input is turned into an error via ret_error. */ + if (jv_get_kind(a) != JV_KIND_STRING) { + return ret_error(a, jv_string("trim input must be a string")); + } + + int len = jv_string_length_bytes(jv_copy(a)); /* presumably the length call consumes its argument, hence the copy while a is still needed below (confirm against jv.h) */ + const char *start = jv_string_value(a); + const char *trim_start = start; + const char *end = trim_start + len; + const char *trim_end = end; + int c; + + if (op & TRIM_LEFT) { + for (;;) { + const char *ns = jvp_utf8_next(trim_start, end, &c); + if (!ns || !jvp_codepoint_is_whitespace(c)) /* stop when there is no next position or the decoded codepoint is not whitespace */ + break; + trim_start = ns; + } + } + + // make sure not empty string or start trim has trimmed everything + if ((op & TRIM_RIGHT) && trim_end > trim_start) { + for (;;) { + const char *ns = jvp_utf8_backtrack(trim_end-1, trim_start, NULL); /* presumably the start of the last codepoint before trim_end; NOTE(review): ns is used without a NULL check, so this relies on the string being valid UTF-8 (confirm) */ + jvp_utf8_next(ns, trim_end, &c); + if (!jvp_codepoint_is_whitespace(c)) + break; + trim_end = ns; + if (ns == trim_start) /* reached the left boundary, nothing more to trim */ + break; + } + } + + // no new string needed if there is nothing to trim + if (trim_start == start && trim_end == end) + return a; + + jv ts = jv_string_sized(trim_start, trim_end - trim_start); + jv_free(a); + return ts; +} + +static jv f_string_trim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT | TRIM_RIGHT); } /* trim: both ends */ +static jv f_string_ltrim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT); } /* ltrim: left end only */ +static jv f_string_rtrim(jq_state *jq, jv a) { return string_trim(a, TRIM_RIGHT); } /* rtrim: right end only */ + static jv f_string_implode(jq_state *jq, jv a) { if (jv_get_kind(a) != JV_KIND_ARRAY) { return ret_error(a, jv_string("implode input must be an array")); @@ -1721,6 +1773,9 @@ BINOPS {f_string_explode, "explode", 1}, {f_string_implode, "implode", 1}, {f_string_indexes, "_strindices", 2}, + {f_string_trim, "trim", 1}, + {f_string_ltrim, "ltrim", 1}, 
+ {f_string_rtrim, "rtrim", 1}, {f_setpath, "setpath", 3}, // FIXME typechecking {f_getpath, "getpath", 2}, {f_delpaths, "delpaths", 2}, diff --git a/src/jv_unicode.c b/src/jv_unicode.c index d197349f48..5a7623151d 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) { assert(out - start == jvp_utf8_encode_length(codepoint)); return out - start; } + +// characters with White_Space property in: +// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +int jvp_codepoint_is_whitespace(int c) { + return + (c >= 0x0009 && c <= 0x000D) || // control-0009..control-000D (TAB, LF, VT, FF, CR) + c == 0x0020 || // SPACE + c == 0x0085 || // control-0085 (NEXT LINE) + c == 0x00A0 || // NO-BREAK SPACE + c == 0x1680 || // OGHAM SPACE MARK + (c >= 0x2000 && c <= 0x200A) || // EN QUAD..HAIR SPACE + c == 0x2028 || // LINE SEPARATOR + c == 0x2029 || // PARAGRAPH SEPARATOR + c == 0x202F || // NARROW NO-BREAK SPACE + c == 0x205F || // MEDIUM MATHEMATICAL SPACE + c == 0x3000 // IDEOGRAPHIC SPACE + ; +} diff --git a/src/jv_unicode.h b/src/jv_unicode.h index 558721a8fd..0e5e9557f7 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar); int jvp_utf8_encode_length(int codepoint); int jvp_utf8_encode(int codepoint, char* out); + +int jvp_codepoint_is_whitespace(int c); #endif diff --git a/tests/jq.test b/tests/jq.test index 584ab2b6ab..eabf836fa0 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1334,6 +1334,26 @@ split("") "xababababax" [1,7,[1,3,5,7]] +# trim +# \u000b is vertical tab (\v not supported by json) +map(trim), map(ltrim), map(rtrim) +[" \n\t\r\f\u000b", ""," ", "a", " a ", "abc", " abc ", " abc", "abc "] +["", "", "", "a", "a", "abc", "abc", "abc", "abc"] +["", "", "", "a", "a ", "abc", "abc ", "abc", "abc "] +["", "", "", "a", " a", "abc", " abc", " abc", "abc"] + +trim, ltrim, rtrim 
+"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000" +"abc" +"abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000" +"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc" + +try trim catch ., try ltrim catch ., try rtrim catch . +123 +"trim input must be a string" +"trim input must be a string" +"trim input must be a string" + indices(1) [0,1,1,2,3,4,1,5] [1,2,6] diff --git a/tests/man.test b/tests/man.test index 07938cd586..31ae3bf2f5 100644 --- a/tests/man.test +++ b/tests/man.test @@ -602,6 +602,12 @@ combinations(2) ["fo", "foo", "barfoo", "foobar", "foob"] ["fo","","bar","foobar","foob"] +trim, ltrim, rtrim +" abc " +"abc" +"abc " +" abc" + explode "foobar" [102,111,111,98,97,114]