mirror of
https://github.com/stedolan/jq.git
synced 2024-05-11 05:55:39 +00:00
Add streaming utilities (fix #827)
This commit is contained in:
39
builtin.c
39
builtin.c
@ -1521,6 +1521,45 @@ static const char* const jq_builtins[] = {
|
||||
"def ascii_upcase:"
|
||||
" explode | map( if 97 <= . and . <= 122 then . - 32 else . end) | implode;",
|
||||
|
||||
// Streaming utilities
|
||||
// truncate_stream(stream): takes a number ($n) as input, evaluates `stream`
// with null as its input, and for every event whose path is longer than $n
// emits the event with the first $n path elements removed.  Events whose
// path has $n or fewer elements are dropped.
// The slice must start at $n (not at a fixed 1) so that depths other than 1
// remove the requested number of path elements.
"def truncate_stream(stream):"
" . as $n | null | stream | . as $input | if (.[0]|length) > $n then setpath([0];$input[0][$n:]) else empty end;",
|
||||
// fromstream(i): folds the stream events produced by `i` back into values
// using `foreach`.
//
// The state is a four-element array:
//   .[0]  value currently being assembled with setpath
//   .[1]  true once at least one leaf has been stored into .[0]
//   .[2]  the value that was in .[0] when it was last rotated out
//   .[3]  the flag that was in .[1] when it was last rotated out
//
// Update phase, per event $item:
//   - an event with an empty path resets .[0]/.[1] and keeps .[2]/.[3];
//   - a one-element (closing) event whose path is shorter than 2 moves
//     .[0]/.[1] into .[2]/.[3] and resets .[0]/.[1];
//   - then, if the event carries a leaf and has a non-empty path, the leaf
//     is stored into .[0] at that path and .[1] becomes true.
//
// Extract phase, per event $item:
//   - on a closing event with a path of length 1, emit .[2] if .[3] is true;
//   - on an event with an empty path, emit $item[1] directly.
"def fromstream(i):"
|
||||
" foreach i as $item ("
|
||||
" [null,false,null,false];"
|
||||
" if ($item[0]|length) == 0 then [null,false,.[2],.[3]]"
|
||||
" elif ($item|length) == 1 and ($item[0]|length) < 2 then [null,false,.[0],.[1]]"
|
||||
" else . end |"
|
||||
" . as $state |"
|
||||
" if ($item|length) > 1 and ($item[0]|length) > 0 then"
|
||||
" [.[0]|setpath(($item|.[0]); ($item|.[1])), "
|
||||
" true, "
|
||||
" $state[2], "
|
||||
" $state[3]] "
|
||||
" else ."
|
||||
" end;"
|
||||
" if ($item[0]|length) == 1 and ($item|length == 1) and .[3] then .[2] else empty end,"
|
||||
" if ($item[0]|length) == 0 then $item[1] else empty end"
|
||||
" );",
|
||||
// tostream: emits the streamed form of the input.
//   - If the input is a string, number, boolean or null, or has length 0,
//     a single event [[], <input>] is emitted.
//   - Otherwise every key from `keys` is visited in order: tostream is
//     applied recursively to the value under that key and the key is
//     prepended to the path of each resulting event.  After all keys, a
//     closing event [[<last key>]] is emitted.
// Each line of the definition ends in "\n" because the jq text contains
// `#` comments, which run to the end of the line.
"def tostream:\n"
|
||||
" {string:true,number:true,boolean:true,null:true} as $leaf_types |\n"
|
||||
" . as $dot |\n"
|
||||
" if $leaf_types[$dot|type] or length==0 then [[],$dot]\n"
|
||||
" else\n"
|
||||
" # We really need a _streaming_ form of `keys`.\n"
|
||||
" # We can use `range` for arrays, but not for objects.\n"
|
||||
" keys as $keys |\n"
|
||||
" $keys[-1] as $last|\n"
|
||||
" ((# for each key\n"
|
||||
" $keys[] | . as $key |\n"
|
||||
" $dot[$key] | . as $dot |\n"
|
||||
" # recurse on each key/value\n"
|
||||
" tostream|.[0]|=[$key]+.),\n"
|
||||
" # then add the closing marker\n"
|
||||
" [[$last]])\n"
|
||||
" end;",
|
||||
|
||||
|
||||
// # Assuming the input array is sorted, bsearch/1 returns
|
||||
// # the index of the target if the target is in the input array; and otherwise
|
||||
// # (-1 - ix), where ix is the insertion point that would leave the array sorted.
|
||||
|
@ -2500,6 +2500,63 @@ sections:
|
||||
|
||||
Returns the line number of the input currently being filtered.
|
||||
|
||||
- title: 'Streaming'
|
||||
body: |
|
||||
|
||||
With the `--stream` option jq can parse input texts in a streaming
|
||||
fashion, allowing jq programs to start processing large JSON texts
|
||||
immediately rather than after the parse completes. If you have a
|
||||
single JSON text that is 1GB in size, streaming it will allow you
|
||||
to process it much more quickly.
|
||||
|
||||
However, streaming isn't easy to deal with as the jq program will
|
||||
have `[<path>, <leaf-value>]` (and a few other forms) as inputs.
|
||||
|
||||
Several builtins are provided to make handling streams easier.
|
||||
|
||||
The examples below use the streamed form of `[1,[2]]`, which
|
||||
is `[[0],1],[[1,0],2],[[1,0]],[[1]]`.
|
||||
|
||||
Streaming forms include `[<path>, <leaf-value>]` (to indicate any
|
||||
scalar value, empty array, or empty object), and `[<path>]` (to
|
||||
indicate the end of an array or object). Future versions of jq
|
||||
run with `--stream` and `--seq` may output additional forms such as
|
||||
`["error message"]` when an input text fails to parse.
|
||||
|
||||
entries:
|
||||
- title: "`truncate_stream(stream_expression)`"
|
||||
body: |
|
||||
|
||||
Consumes a number as input and truncates the corresponding
|
||||
number of path elements from the left of the outputs of the
|
||||
given streaming expression.
|
||||
|
||||
examples:
|
||||
- program: '[1|truncate_stream([[0],1],[[1,0],2],[[1,0]],[[1]])]'
|
||||
input: '1'
|
||||
output: ['[[[0],2],[[0]]]']
|
||||
|
||||
- title: "`fromstream(stream_expression)`"
|
||||
body: |
|
||||
|
||||
Outputs the values reconstructed from the stream events that
|
||||
the stream expression outputs.
|
||||
|
||||
examples:
|
||||
- program: 'fromstream(1|truncate_stream([[0],1],[[1,0],2],[[1,0]],[[1]]))'
|
||||
input: 'null'
|
||||
output: ['[2]']
|
||||
|
||||
- title: "`tostream`"
|
||||
body: |
|
||||
|
||||
The `tostream` builtin outputs the streamed form of its input.
|
||||
|
||||
examples:
|
||||
- program: '. as $dot|fromstream($dot|tostream)|.==$dot'
|
||||
input: '[0,[1,{"a":1},{"b":2}]]'
|
||||
output: ['true']
|
||||
|
||||
- title: Assignment
|
||||
body: |
|
||||
|
||||
|
@ -1,49 +0,0 @@
|
||||
|
||||
# Filter and adjust streamed values so that only values from the Nth
|
||||
# level are output, where N is the number given as input (`.`).
|
||||
def trunc(stream):
|
||||
. as $n | stream | . as $input | if (.[0]|length) > $n then setpath([0];$input[0][$n:]) else empty end;
|
||||
|
||||
# Reduce streamed values back to normal
|
||||
def tovalues(i):
|
||||
def debug(msg): . as $dot | [msg, .] | debug | $dot;
|
||||
foreach i as $item (
|
||||
[null,false,null];
|
||||
|
||||
# Updator
|
||||
#
|
||||
# If the new $item is a top-level value,
|
||||
# then clear out the current value
|
||||
. as [$cur, $cur_isvalid, $prev] |
|
||||
$item as [$path, $leaf] |
|
||||
($item|length > 1) as $has_leaf |
|
||||
($item|length == 1) as $closing |
|
||||
($path|length) as $plen |
|
||||
# if the new $item terminates the current value, then cur is ready
|
||||
# for extraction and we'll start building a new value with the next
|
||||
# inputs
|
||||
if ($plen == 0) or # top-level scalar
|
||||
($closing and $plen < 2) then [null,false,$cur]
|
||||
# else continue building up cur
|
||||
else . end |
|
||||
. as [$cur, $cur_isvalid, $prev] |
|
||||
# If the new $item has a leaf, update the current value
|
||||
if $has_leaf and $plen > 0 then
|
||||
[$cur|setpath(($path); $leaf), # update current value
|
||||
true, # current value is now valid (if, perhaps, incomplete)
|
||||
$prev] # previous value is unchanged
|
||||
else .
|
||||
end;
|
||||
|
||||
# Extractor
|
||||
#
|
||||
. as [$cur, $cur_isvalid, $prev] |
|
||||
$item as [$path, $leaf] |
|
||||
($item|length > 1) as $has_leaf |
|
||||
($item|length == 1) as $closing |
|
||||
($path|length) as $plen |
|
||||
# If previous value is valid, output it
|
||||
if $plen == 1 and $closing then $prev else empty end,
|
||||
# and/or if the new $item is a top-level scalar, output it
|
||||
if $plen == 0 then $leaf else empty end
|
||||
);
|
@ -77,9 +77,6 @@ fi
|
||||
|
||||
## Test JSON sequence support
|
||||
|
||||
## XXX If we add a `stream_fromjson` builtin then we can move these tests
|
||||
## into tests/all.test
|
||||
|
||||
cat > $d/expected <<EOF
|
||||
ignoring parse error: Truncated value at line 2, column 5
|
||||
ignoring parse error: Truncated value at line 2, column 25
|
||||
@ -149,7 +146,7 @@ if which seq > /dev/null 2>&1; then
|
||||
fi
|
||||
|
||||
dd "if=tests/torture/input0.json" bs=$i count=1 2>/dev/null |
|
||||
$VALGRIND $JQ -cn --stream -L "$mods" 'import "streaming" as streaming; streaming::tovalues(inputs)' > $d/out1 2>$d/err || true
|
||||
$VALGRIND $JQ -cn --stream 'fromstream(inputs)' > $d/out1 2>$d/err || true
|
||||
if [ -n "$VALGRIND" ]; then
|
||||
grep '^==[0-9][0-9]*== ERROR SUMMARY: 0 errors' $d/err > /dev/null
|
||||
else
|
||||
|
Reference in New Issue
Block a user