1
0
mirror of https://github.com/stedolan/jq.git synced 2024-05-11 05:55:39 +00:00
stedolan-jq/tests/onig.test
pkoppstein 83f375cc83 Revamp sub/3 to resolve most issues with gsub (and sub with "g") (#2641)
The primary purpose of this commit is to rectify most problems with
`gsub` (and also `sub` with the `g` option), in particular fix #1425 ('\b'),
fix #2354 (lookahead), and fix #2532 (regex == `"^(?!cd ).*$|^cd "`).

This commit also partly resolves #2148 and resolves #1206 in that
`gsub` no longer loops infinitely; however, because the new `gsub`
depends critically on `match/2`, the behavior when regex == `""` is
sometimes non-standard.

The documentation has been updated to reflect the fact that `sub`
and `gsub` are intended to be regular in the second argument.

Also, `_nwise/1` has been tweaked to take advantage of TCO.
2023-07-04 07:46:29 +09:00

164 lines
3.4 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# match builtin
[match("( )*"; "g")]
"abc"
[{"offset":0, "length":0, "string":"", "captures":[]},{"offset":1, "length":0, "string":"", "captures":[]},{"offset":2, "length":0, "string":"", "captures":[]}]
[match("( )*"; "gn")]
"abc"
[]
[match("a"; "gi")]
"āáàä"
[]
[match(["(bar)"])]
"foo bar"
[{"offset": 4, "length": 3, "string": "bar", "captures":[{"offset": 4, "length": 3, "string": "bar", "name": null}]}]
# offsets account for combining codepoints and multi-byte UTF-8
[match("bar")]
"ā bar with a combining codepoint U+0304"
[{"offset": 3, "length": 3, "string": "bar", "captures":[]}]
# matches with combining codepoints still count them in their length
[match("bār")]
"a bār"
[{"offset": 2, "length": 4, "string": "bār", "captures":[]}]
[match(".+?\\b")]
"ā two-codepoint grapheme"
[{"offset": 0, "length": 2, "string": "ā", "captures":[]}]
[match(["foo (?<bar123>bar)? foo", "ig"])]
"foo bar foo foo foo"
[{"offset": 0, "length": 11, "string": "foo bar foo", "captures":[{"offset": 4, "length": 3, "string": "bar", "name": "bar123"}]},{"offset":12, "length": 8, "string": "foo foo", "captures":[{"offset": -1, "length": 0, "string": null, "name": "bar123"}]}]
#test builtin
[test("( )*"; "gn")]
"abc"
[false]
[test("ā")]
"ā"
[true]
capture("(?<a>[a-z]+)-(?<n>[0-9]+)")
"xyzzy-14"
{"a":"xyzzy","n":"14"}
# jq-coded utilities built on match:
#
# The second element in these tests' inputs tests the case where the
# fromstring matches both the head and tail of the string
[.[] | sub(", "; ":")]
["a,b, c, d, e,f", ", a,b, c, d, e,f, "]
["a,b:c, d, e,f",":a,b, c, d, e,f, "]
sub("^(?<head>.)"; "Head=\(.head) Tail=")
"abcdef"
"Head=a Tail=bcdef"
[.[] | gsub(", "; ":")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
["a,b:c:d:e,f",":a,b:c:d:e,f:"]
gsub("(?<d>\\d)"; ":\(.d);")
"a1b2"
"a:1;b:2;"
gsub("a";"b")
"aaaaa"
"bbbbb"
gsub( "(.*)"; ""; "x")
""
""
gsub( ""; "a"; "g")
""
"a"
gsub( "^"; ""; "g")
"a"
"a"
# The following is a regression test and should not be construed as a requirement other than that execution should terminate:
gsub( ""; "a"; "g")
"a"
"aa"
gsub( "$"; "a"; "g")
"a"
"aa"
gsub( "^"; "a")
""
"a"
gsub("(?=u)"; "u")
"qux"
"quux"
gsub("^.*a"; "b")
"aaa"
"b"
gsub("^.*?a"; "b")
"aaa"
"baa"
# The following is for regression testing and should not be construed as a requirement:
[gsub("a"; "b", "c")]
"a"
["b","c"]
[.[] | scan(", ")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[", ",", ",", ",", ",", ",", ",", ",", "]
[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")]]]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "]]]
[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")]]]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "]]]
# reference to named captures
gsub("(?<x>.)[^a]*"; "+\(.x)-")
"Abcabc"
"+A-+a-"
gsub("(?<x>.)(?<y>[0-9])"; "\(.x|ascii_downcase)\(.y)")
"A1 B2 CD"
"a1 b2 CD"
gsub("\\b(?<x>.)"; "\(.x|ascii_downcase)")
"ABC DEF"
"aBC dEF"
# utf-8
sub("(?<x>.)"; "\(.x)!")
""
"!"
[sub("a"; "b", "c")]
"a"
["b","c"]
[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
"aB"
["AB","aB","cB"]
[gsub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
"aB"
["AB","ab","cc"]
# splits and _nwise
[splits("")]
"ab"
["","a","b"]