Add trim/0, ltrim/0 and rtrim/0 that trim leading and trailing whitespace (#3056)

2024-05-11 05:55:39 +00:00 · 2024-03-20 11:04:17 +01:00
parent 81f4f883ac
commit be437ec049
7 changed files with 144 additions and 0 deletions
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@ -1772,6 +1772,25 @@ sections:
            input: '["fo", "foo", "barfoo", "foobar", "foob"]'
            output: ['["fo","","bar","foobar","foob"]']

+      - title: "`trim`, `ltrim`, `rtrim`"
+        body: |
+
+          `trim` trims both leading and trailing whitespace.
+
+          `ltrim` trims only leading (left side) whitespace.
+
+          `rtrim` trims only trailing (right side) whitespace.
+
+          Whitespace characters are the usual `" "`, `"\n"`, `"\t"`, `"\r"`
+          and also all characters in the Unicode character database with the
+          whitespace property. Note that what is considered whitespace might
+          change in the future.
+
+        examples:
+          - program: 'trim, ltrim, rtrim'
+            input: '" abc "'
+            output: ['"abc"', '"abc "', '" abc"']
+
      - title: "`explode`"
        body: |

--- a/jq.1.prebuilt
+++ b/jq.1.prebuilt
@ -1930,6 +1930,30 @@ jq \'[\.[]|rtrimstr("foo")]\'
 .
 .IP "" 0
 .
+.SS "trim, ltrim, rtrim"
+\fBtrim\fR trims both leading and trailing whitespace\.
+.
+.P
+\fBltrim\fR trims only leading (left side) whitespace\.
+.
+.P
+\fBrtrim\fR trims only trailing (right side) whitespace\.
+.
+.P
+Whitespace characters are the usual \fB" "\fR, \fB"\en"\fR, \fB"\et"\fR, \fB"\er"\fR and also all characters in the Unicode character database with the whitespace property\. Note that what is considered whitespace might change in the future\.
+.
+.IP "" 4
+.
+.nf
+
+jq \'trim, ltrim, rtrim\'
+   " abc "
+=> "abc", "abc ", " abc"
+.
+.fi
+.
+.IP "" 0
+.
 .SS "explode"
 Converts an input string into an array of the string\'s codepoint numbers\.
 .
--- a/src/builtin.c
+++ b/src/builtin.c
@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) {
  return jv_string_indexes(a, b);
 }

+enum trim_op { // bit flags selecting which end(s) of a string to trim; values are distinct bits so they can be OR'ed
+  TRIM_LEFT  = 1 << 0, // trim leading whitespace
+  TRIM_RIGHT = 1 << 1  // trim trailing whitespace
+};
+
+static jv string_trim(jv a, int op) { // trims whitespace from a on the side(s) selected by the trim_op bits in op
+  if (jv_get_kind(a) != JV_KIND_STRING) {
+    return ret_error(a, jv_string("trim input must be a string"));
+  }
+
+  int len = jv_string_length_bytes(jv_copy(a)); // a copy is passed — presumably the callee consumes its argument; confirm
+  const char *start = jv_string_value(a);
+  const char *trim_start = start; // first byte that will be kept
+  const char *end = trim_start + len;
+  const char *trim_end = end; // one past the last byte that will be kept
+  int c; // codepoint most recently decoded by jvp_utf8_next
+
+  if (op & TRIM_LEFT) {
+    for (;;) {
+      const char *ns = jvp_utf8_next(trim_start, end, &c);
+      if (!ns || !jvp_codepoint_is_whitespace(c)) // stop when no next pointer is returned or c is not whitespace
+        break;
+      trim_start = ns;
+    }
+  }
+
+  // scan from the right only if bytes remain: the string is non-empty and the left trim did not consume all of it
+  if ((op & TRIM_RIGHT) && trim_end > trim_start) {
+    for (;;) {
+      const char *ns = jvp_utf8_backtrack(trim_end-1, trim_start, NULL); // presumably the start of the codepoint ending at trim_end — confirm
+      jvp_utf8_next(ns, trim_end, &c); // NOTE(review): ns is used without a NULL check, unlike the left loop — confirm it cannot be NULL here
+      if (!jvp_codepoint_is_whitespace(c))
+        break;
+      trim_end = ns;
+      if (ns == trim_start) // reached the left bound: everything that remained was whitespace
+        break;
+    }
+  }
+
+  // nothing was trimmed on either side: return the input itself instead of building a new string
+  if (trim_start == start && trim_end == end)
+    return a;
+
+  jv ts = jv_string_sized(trim_start, trim_end - trim_start);
+  jv_free(a); // released only after ts was built, since trim_start/trim_end derive from jv_string_value(a)
+  return ts;
+}
+
+static jv f_string_trim(jq_state *jq, jv a)  { return string_trim(a, TRIM_LEFT | TRIM_RIGHT); } // trim: both sides
+static jv f_string_ltrim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT); } // ltrim: leading side only
+static jv f_string_rtrim(jq_state *jq, jv a) { return string_trim(a, TRIM_RIGHT); } // rtrim: trailing side only
+
 static jv f_string_implode(jq_state *jq, jv a) {
  if (jv_get_kind(a) != JV_KIND_ARRAY) {
    return ret_error(a, jv_string("implode input must be an array"));
@ -1721,6 +1773,9 @@ BINOPS
  {f_string_explode, "explode", 1},
  {f_string_implode, "implode", 1},
  {f_string_indexes, "_strindices", 2},
+  {f_string_trim, "trim", 1},
+  {f_string_ltrim, "ltrim", 1},
+  {f_string_rtrim, "rtrim", 1},
  {f_setpath, "setpath", 3}, // FIXME typechecking
  {f_getpath, "getpath", 2},
  {f_delpaths, "delpaths", 2},
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) {
  assert(out - start == jvp_utf8_encode_length(codepoint));
  return out - start;
 }
+
+// Returns nonzero if codepoint c is one of the characters below, which have the White_Space property in:
+// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+int jvp_codepoint_is_whitespace(int c) {
+  return
+    (c >= 0x0009 && c <= 0x000D) || // <control-0009>..<control-000D>
+    c == 0x0020                  || // SPACE
+    c == 0x0085                  || // <control-0085>
+    c == 0x00A0                  || // NO-BREAK SPACE
+    c == 0x1680                  || // OGHAM SPACE MARK
+    (c >= 0x2000 && c <= 0x200A) || // EN QUAD..HAIR SPACE
+    c == 0x2028                  || // LINE SEPARATOR
+    c == 0x2029                  || // PARAGRAPH SEPARATOR
+    c == 0x202F                  || // NARROW NO-BREAK SPACE
+    c == 0x205F                  || // MEDIUM MATHEMATICAL SPACE
+    c == 0x3000                     // IDEOGRAPHIC SPACE
+    ;
+}
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar);

 int jvp_utf8_encode_length(int codepoint);
 int jvp_utf8_encode(int codepoint, char* out);
+
+int jvp_codepoint_is_whitespace(int c); // nonzero if codepoint c has the Unicode White_Space property
 #endif
--- a/tests/jq.test
+++ b/tests/jq.test
@ -1334,6 +1334,26 @@ split("")
 "xababababax"
 [1,7,[1,3,5,7]]

+# trim
+# \u000b is vertical tab (\v not supported by json)
+map(trim), map(ltrim), map(rtrim)
+[" \n\t\r\f\u000b", "","  ", "a", " a ", "abc", "  abc  ", "  abc", "abc  "]
+["", "", "", "a", "a", "abc", "abc", "abc", "abc"]
+["", "", "", "a", "a ", "abc", "abc  ", "abc", "abc  "]
+["", "", "", "a", " a", "abc", "  abc", "  abc", "abc"]
+
+trim, ltrim, rtrim
+"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
+"abc"
+"abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
+"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc"
+
+try trim catch ., try ltrim catch ., try rtrim catch .
+123
+"trim input must be a string"
+"trim input must be a string"
+"trim input must be a string"
+
 indices(1)
 [0,1,1,2,3,4,1,5]
 [1,2,6]
--- a/tests/man.test
+++ b/tests/man.test
@ -602,6 +602,12 @@ combinations(2)
 ["fo", "foo", "barfoo", "foobar", "foob"]
 ["fo","","bar","foobar","foob"]

+trim, ltrim, rtrim
+" abc "
+"abc"
+"abc "
+" abc"
+
 explode
 "foobar"
 [102,111,111,98,97,114]