1
0
mirror of https://github.com/stedolan/jq.git synced 2024-05-11 05:55:39 +00:00

regex filters (#432): scan, splits, split, sub, gsub

This commit is contained in:
pkoppstein
2014-07-31 20:32:44 -04:00
committed by Nicolas Williams
parent 0d437e25de
commit a696c6b551
3 changed files with 185 additions and 11 deletions

View File

@@ -974,23 +974,78 @@ static const char* const jq_builtins[] = {
"def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
"def flatten(x): x as $x | reduce .[] as $i ([]; if $i | type == \"array\" and $x > 0 then . + ($i | flatten($x-1)) else . + [$i] end);",
"def range(x): x as $x | range(0;$x);",
// regular expressions:
"def match(re; mode): _match_impl(re; mode; false)|.[];",
"def match(val): (val|type) as $vt | if $vt == \"string\" then match(val; null)"
" elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
" elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
"def test(re; mode): _match_impl(re; mode; true);",
"def test(val): (val|type) as $vt | if $vt == \"string\" then test(val; null)"
" elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
// Ex.: "a1" | capture( "(?<x>[a-z*])" ).x => "a"
" elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
"def capture(re; mods): match(re; mods) | reduce ( .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair);",
"def capture(val): (val|type) as $vt | if $vt == \"string\" then capture(val; null)"
" elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
" elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
" elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
" else error( $vt + \" not a string or array\") end;",
"def scan(re):"
" match(re; \"g\")"
" | if (.captures|length > 0)"
" then [ .captures | .[] | .string ]"
" else .string"
" end ;",
//
// nwise(a; n): emit a stream of successive slices of `a`, each of length n
// (the last one may be shorter); `a` may be an array or a string.
// When `a` has at most n elements it is emitted as-is as the only output.
// nwise(n): the same, applied to the input.
"def nwise(a; n): if a|length <= n then a else a[0:n] , nwise(a[n:]; n) end;",
"def nwise(n): nwise(.; n);",
//
// splits/1 produces a stream; split/1 is retained for backward compatibility.
// splits(re; flags): emit the pieces of the input string that lie between the
// matches of re. The start and end offsets of every match are collected,
// bracketed by 0 and the input length, and then read back pairwise as
// [from, to] slices of the input.
"def splits(re; flags): . as $s"
// # multiple occurrences of "g" are acceptable
" | [ match(re; \"g\" + flags) | (.offset, .offset + .length) ]"
" | [0] + . +[$s|length]"
" | nwise(2)"
" | $s[.[0]:.[1] ] ;",
// splits(re): splits with no extra flags.
"def splits(re): splits(re; null);",
//
// split(re; flags) and split(re) collect the stream produced by splits into a
// single array; split emits an array for backward compatibility.
"def split(re; flags): [ splits(re; flags) ];",
"def split(re): [ splits(re; null) ];",
//
// sub(re; s): replace the first match of re in the input string with s.
// If the match has named captures, a capture object is created and piped to s,
// so "\(.x)" in s refers to the capture named x; otherwise s is evaluated
// against the input string. The text before and after the match is always
// kept, and an input with no match is emitted unchanged.
"def sub(re; s):"
" . as $in"
" | [match(re)]"
// # no match: nothing to replace, so pass the input through
" | if length == 0 then $in"
" else .[0]"
" | . as $r"
// # create the \"capture\" object:
" | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
" ({}; . + $pair)"
" | if . == {} then $in | .[0:$r.offset]+s+.[$r.offset+$r.length:]"
// # with captures, s sees the capture object; splice its result into $in
" else $in[0:$r.offset] + (. | s) + $in[$r.offset+$r.length:]"
" end"
" end ;",
//
// gsub(re; s): repeated substitution of re (which may contain named captures).
// Every non-overlapping match is replaced by s; text outside the matches is kept.
"def gsub(re; s):"
// # _stredit(edits;s) - s is the \"to\" string, which might contain capture variables,
// # so if an edit contains captures, then create the capture object and pipe it to s.
// # Edits are applied from the last match to the first, so the offsets of the
// # edits still to be applied remain valid after each replacement.
" def _stredit(edits; s):"
" if (edits|length) == 0 then ."
" else . as $in"
" | (edits|length -1) as $l"
" | (edits[$l]) as $edit"
// # create the \"capture\" object:
" | ($edit | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
" ({}; . + $pair) )"
" | if . == {} then $in | .[0:$edit.offset]+s+.[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
// # with captures, s sees the capture object; splice its result into $in
" else $in[0:$edit.offset] + (. | s) + $in[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
" end"
" end ;"
" [match(re;\"g\")] as $edits | _stredit($edits; s) ;",
//#######################################################################
// range/3, with a `by` expression argument
"def range(init; upto; by): "
" init as $init |"

View File

@@ -1721,6 +1721,91 @@ sections:
- program: 'capture("(?<a>[a-z]+)-(?<n>[0-9]+)")'
input: '"xyzzy-14"'
output: '{ "a": "xyzzy", "n": "14" }'
- title: "`scan(regex)`, `scan(regex; flags)`"
body: |
Emit a stream of the non-overlapping substrings of the input
that match the regex in accordance with the flags, if any
have been specified. If there is no match, the stream is empty.
To capture all the matches for each input string, use the idiom
[ expr ], e.g. [ scan(regex) ].
example:
- program: 'scan("c")'
input: '"abcdefabc"'
output: '"c"'
'"c"'
- program: '[scan("b")]'
input: '""'
output: '[]'
- title: "`split(regex)`, `split(regex; flags)`"
body: |
For backwards compatibility, `split` emits an array of the strings
corresponding to the successive segments of the input string after it
has been split at the boundaries defined by the regex and any
specified flags. The substrings corresponding to the boundaries
themselves are excluded. If regex is the empty string, then the first
match will be the empty string.
`split(regex)` can be thought of as a wrapper around `splits(regex)`,
and similarly for `split(regex; flags)`.
example:
- program: 'split(", *")'
input: '"ab,cd, ef"'
output: '["ab","cd","ef"]'
- title: "`splits(regex)`, `splits(regex; flags)`"
body: |
These provide the same results as their `split` counterparts,
but as a stream instead of an array.
example:
- program: 'splits(", *")'
input: '("ab,cd", "ef, gh")'
output:
'"ab"'
'"cd"'
'"ef"'
'"gh"'
- title: "`sub(regex; tostring)`"
body: |
Emit the string obtained by replacing the first match of regex in the
input string with `tostring`, after interpolation. `tostring` should
be a jq string, and may contain references to named captures. The
named captures are, in effect, presented as a JSON object (as
constructed by `capture`) to `tostring`, so a reference to a captured
variable named "x" would take the form: "\(.x)".
example:
- program: 'sub("^[^a-z]*(?<x>[a-z]*).*"; "Z\(.x)Z\(.x)")'
input: '"123abc456"'
output: '"ZabcZabc"'
- title: "`gsub(regex; string)`"
body: |
`gsub` is like `sub` but all the non-overlapping occurrences of the regex are
replaced by the string, after interpolation.
example:
- program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
input: '"Abcabc"'
output: '"+A-+a-"'
- title: Advanced features
body: |

View File

@@ -820,6 +820,40 @@ capture("(?<a>[a-z]+)-(?<n>[0-9]+)")
"xyzzy-14"
{"a":"xyzzy","n":"14"}
# jq-coded utilities built on match:
#
# The second element in these tests' inputs tests the case where the
# fromstring matches both the head and tail of the string
[.[] | sub(", "; ":")]
["a,b, c, d, e,f", ", a,b, c, d, e,f, "]
["a,b:c, d, e,f",":a,b, c, d, e,f, "]
, #2 [", ",", ",", "],["a,b","c","d","e,f"]], #3 [[":a,b, c, d, e,f,"],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
[.[] | gsub(", "; ":")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
["a,b:c:d:e,f",":a,b:c:d:e,f:"]
[.[] | scan(", ")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[.[] | split(", ")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
########################
[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")], split(", *")]]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","],["a","b","c","d","e","f"]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "],["","a","b","c","d","e","f",""]]]
[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")], split(", +")]]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "],["a,b","c","d","e,f"]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
# reference to named captures
gsub("(?<x>.)[^a]*"; "+\(.x)-")
"Abcabc"
"+A-+a-"
[.[]|ltrimstr("foo")]
["fo", "foo", "barfoo", "foobar", "afoo"]
["fo","","barfoo","bar","afoo"]