1
0
mirror of https://github.com/stedolan/jq.git synced 2024-05-11 05:55:39 +00:00

Add trim/0, ltrim/0 and rtrim/0 that trims leading and trailing whitespace (#3056)

This commit is contained in:
Mattias Wadman
2024-03-20 11:04:17 +01:00
committed by GitHub
parent 81f4f883ac
commit be437ec049
7 changed files with 144 additions and 0 deletions

View File

@ -1772,6 +1772,25 @@ sections:
input: '["fo", "foo", "barfoo", "foobar", "foob"]'
output: ['["fo","","bar","foobar","foob"]']
- title: "`trim`, `ltrim`, `rtrim`"
body: |
`trim` trims both leading and trailing whitespace.
`ltrim` trims only leading (left side) whitespace.
`rtrim` trims only trailing (right side) whitespace.
Whitespace characters are the usual `" "`, `"\n"`, `"\t"`, `"\r"`
and also all characters in the Unicode character database with the
whitespace property. Note that what is considered whitespace might
change in the future.
examples:
- program: 'trim, ltrim, rtrim'
input: '" abc "'
output: ['"abc"', '"abc "', '" abc"']
- title: "`explode`"
body: |

24
jq.1.prebuilt generated
View File

@ -1930,6 +1930,30 @@ jq \'[\.[]|rtrimstr("foo")]\'
.
.IP "" 0
.
.SS "trim, ltrim, rtrim"
\fBtrim\fR trims both leading and trailing whitespace\.
.
.P
\fBltrim\fR trims only leading (left side) whitespace\.
.
.P
\fBrtrim\fR trims only trailing (right side) whitespace\.
.
.P
Whitespace characters are the usual \fB" "\fR, \fB"\en"\fR, \fB"\et"\fR, \fB"\er"\fR and also all characters in the Unicode character database with the whitespace property\. Note that what is considered whitespace might change in the future\.
.
.IP "" 4
.
.nf
jq \'trim, ltrim, rtrim\'
" abc "
=> "abc", "abc ", " abc"
.
.fi
.
.IP "" 0
.
.SS "explode"
Converts an input string into an array of the string\'s codepoint numbers\.
.

View File

@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) {
return jv_string_indexes(a, b);
}
// Bit flags selecting which side(s) of a string string_trim() strips.
// The two values are distinct bits so they can be OR-ed together.
enum trim_op {
  TRIM_LEFT  = 0x1,  // strip leading whitespace
  TRIM_RIGHT = 0x2   // strip trailing whitespace
};
// Shared implementation behind the trim, ltrim and rtrim builtins.
//
// a:  the input value. It is handed to ret_error on a type mismatch,
//     returned unchanged when nothing is stripped, and freed otherwise.
// op: bitmask of TRIM_LEFT and/or TRIM_RIGHT choosing the side(s) to strip.
//
// Returns the error "trim input must be a string" for non-string input,
// otherwise the string with whitespace (as decided by
// jvp_codepoint_is_whitespace) removed from the requested side(s).
static jv string_trim(jv a, int op) {
  if (jv_get_kind(a) != JV_KIND_STRING)
    return ret_error(a, jv_string("trim input must be a string"));

  const int nbytes = jv_string_length_bytes(jv_copy(a));
  const char *const base = jv_string_value(a);
  const char *const limit = base + nbytes;
  const char *head = base;   // first byte that is kept
  const char *tail = limit;  // one past the last byte that is kept
  int cp;

  if (op & TRIM_LEFT) {
    // Walk forward one codepoint at a time while it is whitespace; a NULL
    // result from jvp_utf8_next also ends the scan.
    const char *next;
    while ((next = jvp_utf8_next(head, limit, &cp)) != NULL &&
           jvp_codepoint_is_whitespace(cp)) {
      head = next;
    }
  }

  // Skip the backward scan when the string is empty or the forward scan
  // has already consumed everything.
  if ((op & TRIM_RIGHT) && tail > head) {
    do {
      // Locate the start of the codepoint that ends just before tail,
      // then decode it to test for whitespace.
      const char *prev = jvp_utf8_backtrack(tail - 1, head, NULL);
      jvp_utf8_next(prev, tail, &cp);
      if (!jvp_codepoint_is_whitespace(cp))
        break;
      tail = prev;
    } while (tail != head);
  }

  // Nothing stripped: hand the input back without building a new string.
  if (head == base && tail == limit)
    return a;

  jv trimmed = jv_string_sized(head, tail - head);
  jv_free(a);
  return trimmed;
}
// Builtin `trim`: strips whitespace from both ends of the input string.
static jv f_string_trim(jq_state *jq, jv a) {
  const int both_sides = TRIM_LEFT | TRIM_RIGHT;
  return string_trim(a, both_sides);
}
// Builtin `ltrim`: strips only leading (left side) whitespace.
static jv f_string_ltrim(jq_state *jq, jv a) {
  const int left_only = TRIM_LEFT;
  return string_trim(a, left_only);
}
// Builtin `rtrim`: strips only trailing (right side) whitespace.
static jv f_string_rtrim(jq_state *jq, jv a) {
  const int right_only = TRIM_RIGHT;
  return string_trim(a, right_only);
}
static jv f_string_implode(jq_state *jq, jv a) {
if (jv_get_kind(a) != JV_KIND_ARRAY) {
return ret_error(a, jv_string("implode input must be an array"));
@ -1721,6 +1773,9 @@ BINOPS
{f_string_explode, "explode", 1},
{f_string_implode, "implode", 1},
{f_string_indexes, "_strindices", 2},
{f_string_trim, "trim", 1},
{f_string_ltrim, "ltrim", 1},
{f_string_rtrim, "rtrim", 1},
{f_setpath, "setpath", 3}, // FIXME typechecking
{f_getpath, "getpath", 2},
{f_delpaths, "delpaths", 2},

View File

@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) {
assert(out - start == jvp_utf8_encode_length(codepoint));
return out - start;
}
// Reports whether codepoint c carries the Unicode White_Space property.
// The set is taken from:
// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
// Returns 1 for a whitespace codepoint and 0 for anything else.
int jvp_codepoint_is_whitespace(int c) {
  // <control-0009>..<control-000D>
  if (c >= 0x0009 && c <= 0x000D)
    return 1;
  // EN QUAD..HAIR SPACE
  if (c >= 0x2000 && c <= 0x200A)
    return 1;

  switch (c) {
  case 0x0020:  // SPACE
  case 0x0085:  // <control-0085>
  case 0x00A0:  // NO-BREAK SPACE
  case 0x1680:  // OGHAM SPACE MARK
  case 0x2028:  // LINE SEPARATOR
  case 0x2029:  // PARAGRAPH SEPARATOR
  case 0x202F:  // NARROW NO-BREAK SPACE
  case 0x205F:  // MEDIUM MATHEMATICAL SPACE
  case 0x3000:  // IDEOGRAPHIC SPACE
    return 1;
  default:
    return 0;
  }
}

View File

@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar);
int jvp_utf8_encode_length(int codepoint);
int jvp_utf8_encode(int codepoint, char* out);
// Returns non-zero when codepoint c has the Unicode White_Space property.
int jvp_codepoint_is_whitespace(int c);
#endif

View File

@ -1334,6 +1334,26 @@ split("")
"xababababax"
[1,7,[1,3,5,7]]
# trim
# \u000b is vertical tab (\v not supported by json)
map(trim), map(ltrim), map(rtrim)
[" \n\t\r\f\u000b", ""," ", "a", " a ", "abc", " abc ", " abc", "abc "]
["", "", "", "a", "a", "abc", "abc", "abc", "abc"]
["", "", "", "a", "a ", "abc", "abc ", "abc", "abc "]
["", "", "", "a", " a", "abc", " abc", " abc", "abc"]
trim, ltrim, rtrim
"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
"abc"
"abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc"
try trim catch ., try ltrim catch ., try rtrim catch .
123
"trim input must be a string"
"trim input must be a string"
"trim input must be a string"
indices(1)
[0,1,1,2,3,4,1,5]
[1,2,6]

6
tests/man.test generated
View File

@ -602,6 +602,12 @@ combinations(2)
["fo", "foo", "barfoo", "foobar", "foob"]
["fo","","bar","foobar","foob"]
trim, ltrim, rtrim
" abc "
"abc"
"abc "
" abc"
explode
"foobar"
[102,111,111,98,97,114]