1
0
mirror of https://github.com/stedolan/jq.git synced 2024-05-11 05:55:39 +00:00

Add streaming utilities (fix #827)

This commit is contained in:
Nicolas Williams
2015-06-26 20:16:23 -05:00
parent 5108a451ca
commit 25d47ca08e
4 changed files with 97 additions and 53 deletions

View File

@ -1521,6 +1521,45 @@ static const char* const jq_builtins[] = {
"def ascii_upcase:"
" explode | map( if 97 <= . and . <= 122 then . - 32 else . end) | implode;",
// Streaming utilities
//
// truncate_stream(stream): takes a number $n as input and evaluates `stream`
// with null as its input.  Events whose path has $n or fewer elements are
// dropped; for the others the first $n path elements are removed.
"def truncate_stream(stream):"
" . as $n | null | stream | . as $input | if (.[0]|length) > $n then setpath([0];$input[0][$n:]) else empty end;",
// fromstream(i): rebuild values from the stream events produced by `i`.
// The foreach state is [cur, cur_valid, prev, prev_valid]:
//  - an event with an empty path resets cur to null/false and keeps prev;
//  - a closing event (length 1) whose path has fewer than 2 elements moves
//    cur and cur_valid into the prev slots and resets cur to null/false;
//  - an event carrying a leaf at a non-empty path is stored into cur with
//    setpath and marks cur as valid.
// Extraction emits prev on a closing event with a one-element path (when
// prev_valid is true), and emits the leaf itself for an empty-path event.
"def fromstream(i):"
" foreach i as $item ("
" [null,false,null,false];"
" if ($item[0]|length) == 0 then [null,false,.[2],.[3]]"
" elif ($item|length) == 1 and ($item[0]|length) < 2 then [null,false,.[0],.[1]]"
" else . end |"
" . as $state |"
" if ($item|length) > 1 and ($item[0]|length) > 0 then"
" [.[0]|setpath(($item|.[0]); ($item|.[1])), "
" true, "
" $state[2], "
" $state[3]] "
" else ."
" end;"
" if ($item[0]|length) == 1 and ($item|length == 1) and .[3] then .[2] else empty end,"
" if ($item[0]|length) == 0 then $item[1] else empty end"
" );",
// tostream: emit the streamed form of the input.  Scalars and values of
// length 0 yield a single [[],value] event; otherwise each key's value is
// streamed recursively with the key prepended to its paths, followed by a
// closing event [[last_key]].
"def tostream:\n"
" {string:true,number:true,boolean:true,null:true} as $leaf_types |\n"
" . as $dot |\n"
" if $leaf_types[$dot|type] or length==0 then [[],$dot]\n"
" else\n"
" # We really need a _streaming_ form of `keys`.\n"
" # We can use `range` for arrays, but not for objects.\n"
" keys as $keys |\n"
" $keys[-1] as $last|\n"
" ((# for each key\n"
" $keys[] | . as $key |\n"
" $dot[$key] | . as $dot |\n"
" # recurse on each key/value\n"
" tostream|.[0]|=[$key]+.),\n"
" # then add the closing marker\n"
" [[$last]])\n"
" end;",
// # Assuming the input array is sorted, bsearch/1 returns
// # the index of the target if the target is in the input array; and otherwise
// # (-1 - ix), where ix is the insertion point that would leave the array sorted.

View File

@ -2500,6 +2500,63 @@ sections:
Returns the line number of the input currently being filtered.
- title: 'Streaming'
body: |
With the `--stream` option jq can parse input texts in a streaming
fashion, allowing jq programs to start processing large JSON texts
immediately rather than after the parse completes. If you have a
single JSON text that is 1GB in size, streaming it will allow you
to process it much more quickly.
However, streaming isn't easy to deal with as the jq program will
have `[<path>, <leaf-value>]` (and a few other forms) as inputs.
Several builtins are provided to make handling streams easier.
The examples below use the streamed form of `[1,[2]]`, which
is `[[0],1],[[1,0],2],[[1,0]],[[1]]`.
Streaming forms include `[<path>, <leaf-value>]` (to indicate any
scalar value, empty array, or empty object), and `[<path>]` (to
indicate the end of an array or object). Future versions of jq
run with `--stream` and `--seq` may output additional forms such as
`["error message"]` when an input text fails to parse.
entries:
- title: "`truncate_stream(stream_expression)`"
body: |
Consumes a number as input and truncates the corresponding
number of path elements from the left of the outputs of the
given streaming expression.
examples:
- program: '[1|truncate_stream([[0],1],[[1,0],2],[[1,0]],[[1]])]'
input: '1'
output: ['[[[0],2],[[0]]]']
- title: "`fromstream(stream_expression)`"
body: |
Outputs values corresponding to the stream expression's
outputs.
examples:
- program: 'fromstream(1|truncate_stream([[0],1],[[1,0],2],[[1,0]],[[1]]))'
input: 'null'
output: ['[2]']
- title: "`tostream`"
body: |
The `tostream` builtin outputs the streamed form of its input.
examples:
- program: '. as $dot|fromstream($dot|tostream)|.==$dot'
input: '[0,[1,{"a":1},{"b":2}]]'
output: ['true']
- title: Assignment
body: |

View File

@ -1,49 +0,0 @@
# Filter and adjust streamed values: with a number $n as input, keep only
# the events of `stream` whose path has more than $n elements, and drop the
# first $n elements from each kept path.
def trunc(stream):
. as $n | stream | . as $input | if (.[0]|length) > $n then setpath([0];$input[0][$n:]) else empty end;
# Reduce streamed values back to normal.
#
# `i` is an expression producing stream events: [<path>, <leaf>] or the
# closing form [<path>].  The foreach state is [cur, cur_isvalid, prev]:
# the value being built, whether it has received a leaf yet, and the most
# recently terminated value.
def tovalues(i):
def debug(msg): . as $dot | [msg, .] | debug | $dot;
foreach i as $item (
[null,false,null];
# Updater
#
# If the new $item is a top-level value,
# then clear out the current value
. as [$cur, $cur_isvalid, $prev] |
$item as [$path, $leaf] |
($item|length > 1) as $has_leaf |
($item|length == 1) as $closing |
($path|length) as $plen |
# if the new $item terminates the current value, then cur is ready
# for extraction and we'll start building a new value with the next
# inputs
if ($plen == 0) or # top-level scalar
($closing and $plen < 2) then [null,false,$cur]
# else continue building up cur
else . end |
. as [$cur, $cur_isvalid, $prev] |
# If the new $item has a leaf, update the current value
if $has_leaf and $plen > 0 then
[$cur|setpath(($path); $leaf), # update current value
true, # current value is now valid (if, perhaps, incomplete)
$prev] # previous value is unchanged
else .
end;
# Extractor
#
. as [$cur, $cur_isvalid, $prev] |
$item as [$path, $leaf] |
($item|length > 1) as $has_leaf |
($item|length == 1) as $closing |
($path|length) as $plen |
# If this $item closes a top-level value, output the previous value
if $plen == 1 and $closing then $prev else empty end,
# and/or if the new $item is a top-level scalar, output it
if $plen == 0 then $leaf else empty end
);

View File

@ -77,9 +77,6 @@ fi
## Test JSON sequence support
## XXX If we add a `stream_fromjson` builtin then we can move these tests
## into tests/all.test
cat > $d/expected <<EOF
ignoring parse error: Truncated value at line 2, column 5
ignoring parse error: Truncated value at line 2, column 25
@ -149,7 +146,7 @@ if which seq > /dev/null 2>&1; then
fi
dd "if=tests/torture/input0.json" bs=$i count=1 2>/dev/null |
$VALGRIND $JQ -cn --stream -L "$mods" 'import "streaming" as streaming; streaming::tovalues(inputs)' > $d/out1 2>$d/err || true
$VALGRIND $JQ -cn --stream 'fromstream(inputs)' > $d/out1 2>$d/err || true
if [ -n "$VALGRIND" ]; then
grep '^==[0-9][0-9]*== ERROR SUMMARY: 0 errors' $d/err > /dev/null
else