mirror of
https://github.com/stedolan/jq.git
synced 2024-05-11 05:55:39 +00:00
regex filters (#432): scan, splits, split, sub, gsub
This commit is contained in:
committed by
Nicolas Williams
parent
0d437e25de
commit
a696c6b551
77
builtin.c
77
builtin.c
@@ -974,23 +974,78 @@ static const char* const jq_builtins[] = {
|
||||
"def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
|
||||
"def flatten(x): x as $x | reduce .[] as $i ([]; if $i | type == \"array\" and $x > 0 then . + ($i | flatten($x-1)) else . + [$i] end);",
|
||||
"def range(x): x as $x | range(0;$x);",
|
||||
// regular expressions:
|
||||
"def match(re; mode): _match_impl(re; mode; false)|.[];",
|
||||
"def match(val): (val|type) as $vt | if $vt == \"string\" then match(val; null)"
|
||||
" elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
|
||||
" elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
|
||||
" else error( $vt + \" not a string or array\") end;",
|
||||
" elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
|
||||
" elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
|
||||
" else error( $vt + \" not a string or array\") end;",
|
||||
"def test(re; mode): _match_impl(re; mode; true);",
|
||||
"def test(val): (val|type) as $vt | if $vt == \"string\" then test(val; null)"
|
||||
" elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
|
||||
" elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
|
||||
" else error( $vt + \" not a string or array\") end;",
|
||||
// Ex.: "a1" | capture( "(?<x>[a-z*])" ).x => "a"
|
||||
" elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
|
||||
" elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
|
||||
" else error( $vt + \" not a string or array\") end;",
|
||||
"def capture(re; mods): match(re; mods) | reduce ( .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair);",
|
||||
"def capture(val): (val|type) as $vt | if $vt == \"string\" then capture(val; null)"
|
||||
" elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
|
||||
" elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
|
||||
" else error( $vt + \" not a string or array\") end;",
|
||||
" elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
|
||||
" elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
|
||||
" else error( $vt + \" not a string or array\") end;",
|
||||
"def scan(re):"
|
||||
" match(re; \"g\")"
|
||||
" | if (.captures|length > 0)"
|
||||
" then [ .captures | .[] | .string ]"
|
||||
" else .string"
|
||||
" end ;",
|
||||
//
|
||||
// If input is an array, then emit a stream of successive subarrays of length n (or less),
|
||||
// and similarly for strings.
|
||||
"def nwise(a; n): if a|length <= n then a else a[0:n] , nwise(a[n:]; n) end;",
|
||||
"def nwise(n): nwise(.; n);",
|
||||
//
|
||||
// splits/1 produces a stream; split/1 is retained for backward compatibility.
|
||||
"def splits(re; flags): . as $s"
|
||||
// # multiple occurrences of "g" are acceptable
|
||||
" | [ match(re; \"g\" + flags) | (.offset, .offset + .length) ]"
|
||||
" | [0] + . +[$s|length]"
|
||||
" | nwise(2)"
|
||||
" | $s[.[0]:.[1] ] ;",
|
||||
"def splits(re): splits(re; null);",
|
||||
//
|
||||
// split emits an array for backward compatibility
|
||||
"def split(re; flags): [ splits(re; flags) ];",
|
||||
"def split(re): [ splits(re; null) ];",
|
||||
//
|
||||
// If s contains capture variables, then create a capture object and pipe it to s
|
||||
"def sub(re; s):"
|
||||
" . as $in"
|
||||
" | [match(re)]"
|
||||
" | .[0]"
|
||||
" | . as $r"
|
||||
// # create the \"capture\" object:
|
||||
" | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
|
||||
" ({}; . + $pair)"
|
||||
" | if . == {} then $in | .[0:$r.offset]+s+.[$r.offset+$r.length:]"
|
||||
" else (. | s)"
|
||||
" end ;",
|
||||
//
|
||||
// repeated substitution of re (which may contain named captures)
|
||||
"def gsub(re; s):"
|
||||
// # _stredit(edits;s) - s is the \"to\" string, which might contain capture variables,
|
||||
// # so if an edit contains captures, then create the capture object and pipe it to s
|
||||
" def _stredit(edits; s):"
|
||||
" if (edits|length) == 0 then ."
|
||||
" else . as $in"
|
||||
" | (edits|length -1) as $l"
|
||||
" | (edits[$l]) as $edit"
|
||||
// # create the \"capture\" object:
|
||||
" | ($edit | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
|
||||
" ({}; . + $pair) )"
|
||||
" | if . == {} then $in | .[0:$edit.offset]+s+.[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
|
||||
" else (if $l == 0 then \"\" else ($in | _stredit(edits[0:$l]; s)) end) + (. | s)"
|
||||
" end"
|
||||
" end ;"
|
||||
" [match(re;\"g\")] as $edits | _stredit($edits; s) ;",
|
||||
|
||||
//#######################################################################
|
||||
// range/3, with a `by` expression argument
|
||||
"def range(init; upto; by): "
|
||||
" init as $init |"
|
||||
|
@@ -1721,6 +1721,91 @@ sections:
|
||||
- program: 'capture("(?<a>[a-z]+)-(?<n>[0-9]+)")'
|
||||
input: '"xyzzy-14"'
|
||||
output: '{ "a": "xyzzy", "n": "14" }'
|
||||
|
||||
- title: "`scan(regex)`, `scan(regex; flags)`"
|
||||
body: |
|
||||
|
||||
Emit a stream of the non-overlapping substrings of the input
|
||||
that match the regex in accordance with the flags, if any
|
||||
have been specified. If there is no match, the stream is empty.
|
||||
To capture all the matches for each input string, use the idiom
|
||||
[ expr ], e.g. [ scan(regex) ].
|
||||
|
||||
example:
|
||||
- program: 'scan("c")'
|
||||
input: '"abcdefabc"'
|
||||
output: '"c"'
|
||||
'"c"'
|
||||
|
||||
- program: 'scan("b")'
|
||||
input: '("", "")'
|
||||
output: '[]'
|
||||
'[]'
|
||||
|
||||
- title: "`split(regex)`, `split(regex; flags)`"
|
||||
body: |
|
||||
|
||||
For backwards compatibility, `split` emits an array of the strings
|
||||
corresponding to the successive segments of the input string after it
|
||||
has been split at the boundaries defined by the regex and any
|
||||
specified flags. The substrings corresponding to the boundaries
|
||||
themselves are excluded. If regex is the empty string, then the first
|
||||
match will be the empty string.
|
||||
|
||||
`split(regex)` can be thought of as a wrapper around `splits(regex)`,
|
||||
and similarly for `split(regex; flags)`.
|
||||
|
||||
example:
|
||||
- program: 'split(", *")'
|
||||
input: '"ab,cd, ef"'
|
||||
output: '["ab","cd","ef"]'
|
||||
|
||||
|
||||
- title: "`splits(regex)`, `splits(regex; flags)`"
|
||||
body: |
|
||||
|
||||
These provide the same results as their `split` counterparts,
|
||||
but as a stream instead of an array.
|
||||
|
||||
example:
|
||||
- program: 'splits(", *")'
|
||||
input: '("ab,cd", "ef, gh")'
|
||||
output:
|
||||
'"ab"'
|
||||
'"cd"'
|
||||
'"ef"'
|
||||
'"gh"'
|
||||
|
||||
- title: "`sub(regex; tostring)`"
|
||||
|
||||
body: |
|
||||
|
||||
Emit the string obtained by replacing the first match of regex in the
|
||||
input string with `tostring`, after interpolation. `tostring` should
|
||||
be a jq string, and may contain references to named captures. The
|
||||
named captures are, in effect, presented as a JSON object (as
|
||||
constructed by `capture`) to `tostring`, so a reference to a captured
|
||||
variable named "x" would take the form: "\(.x)".
|
||||
|
||||
example:
|
||||
- program: 'sub("^[^a-z]*(?<x>[a-z]*).*"; "Z\(.x)Z\(.x)")'
|
||||
input: '"123abc456"'
|
||||
output: '"ZabcZabc"'
|
||||
|
||||
|
||||
- title: "`gsub(regex; string)`"
|
||||
|
||||
body: |
|
||||
|
||||
`gsub` is like `sub` but all the non-overlapping occurrences of the regex are
|
||||
replaced by the string, after interpolation.
|
||||
|
||||
example:
|
||||
- program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
|
||||
|
||||
input: '"Abcabc"'
|
||||
output: '"+A-+a-"'
|
||||
|
||||
|
||||
- title: Advanced features
|
||||
body: |
|
||||
|
@@ -820,6 +820,40 @@ capture("(?<a>[a-z]+)-(?<n>[0-9]+)")
|
||||
"xyzzy-14"
|
||||
{"a":"xyzzy","n":"14"}
|
||||
|
||||
|
||||
# jq-coded utilities built on match:
|
||||
#
|
||||
# The second element in these tests' inputs tests the case where the
|
||||
# fromstring matches both the head and tail of the string
|
||||
[.[] | sub(", "; ":")]
|
||||
["a,b, c, d, e,f", ", a,b, c, d, e,f, "]
|
||||
["a,b:c, d, e,f",":a,b, c, d, e,f, "]
|
||||
, #2 [", ",", ",", "],["a,b","c","d","e,f"]], #3 [[":a,b, c, d, e,f,"],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
|
||||
|
||||
[.[] | gsub(", "; ":")]
|
||||
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
|
||||
["a,b:c:d:e,f",":a,b:c:d:e,f:"]
|
||||
|
||||
[.[] | scan(", ")]
|
||||
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
|
||||
|
||||
[.[] | split(", ")]
|
||||
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
|
||||
|
||||
########################
|
||||
[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")], split(", *")]]
|
||||
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
|
||||
[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","],["a","b","c","d","e","f"]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "],["","a","b","c","d","e","f",""]]]
|
||||
|
||||
[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")], split(", +")]]
|
||||
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
|
||||
[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "],["a,b","c","d","e,f"]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
|
||||
|
||||
# reference to named captures
|
||||
gsub("(?<x>.)[^a]*"; "+\(.x)-")
|
||||
"Abcabc"
|
||||
"+A-+a-"
|
||||
|
||||
[.[]|ltrimstr("foo")]
|
||||
["fo", "foo", "barfoo", "foobar", "afoo"]
|
||||
["fo","","barfoo","bar","afoo"]
|
||||
|
Reference in New Issue
Block a user