mirror of
https://github.com/stedolan/jq.git
synced 2024-05-11 05:55:39 +00:00
This reverts commit 77936a594d797c480f26bfcef3636a74588a6918. There are too many odd bugs in this mode, and it turns out to be a bad idea anyway. Instead, in the future a better option will be to pursue alternative parsers, such as: - a streaming parser that outputs only when a new leaf value is added or an array/object is opened/closed; options here include whether to include a path in each output; - parsers for binary JSON encodings (there is a variety of them). Then one might run jq with a streaming parser and use `reduce` to coalesce inputs from some depth down (instead of from one level down, as the reverted commit had intended). Besides, a fully streaming parser is desirable in some cases, so we should have such a thing as an option. I've explored modifying the current parser to support a streaming option, but it only makes the code very difficult to follow, which is one reason that alternative parsers make sense. At any rate, this is all for the future. For now there's no streaming of individual texts, just text sequences.
487 lines
13 KiB
C
487 lines
13 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include "jv.h"
|
|
#include "jv_dtoa.h"
|
|
#include "jv_unicode.h"
|
|
#include "jv_alloc.h"
|
|
#include "jv_dtoa.h"
|
|
|
|
/* Status of a parsing step: a null pointer means there is nothing to report,
 * any other value points at a constant message string. */
typedef const char* presult;

/* Evaluate the presult expression x once; if it is non-null, return it from
 * the enclosing function. */
#define TRY(x)                           \
  do {                                   \
    presult try_status_ = (x);           \
    if (try_status_) return try_status_; \
  } while (0)

/* Return type for parsing steps. With GCC-compatible compilers the result is
 * additionally marked warn_unused_result. */
#if defined(__GNUC__)
#define pfunc __attribute__((warn_unused_result)) presult
#else
#define pfunc presult
#endif
|
|
|
|
struct jv_parser {
|
|
const char* curr_buf;
|
|
int curr_buf_length;
|
|
int curr_buf_pos;
|
|
int curr_buf_is_partial;
|
|
unsigned bom_strip_position;
|
|
|
|
jv* stack;
|
|
int stackpos;
|
|
int stacklen;
|
|
jv next;
|
|
|
|
char* tokenbuf;
|
|
int tokenpos;
|
|
int tokenlen;
|
|
|
|
int line, column;
|
|
|
|
struct dtoa_context dtoa;
|
|
|
|
enum {
|
|
JV_PARSER_NORMAL,
|
|
JV_PARSER_STRING,
|
|
JV_PARSER_STRING_ESCAPE
|
|
} st;
|
|
};
|
|
|
|
|
|
static void parser_init(struct jv_parser* p) {
|
|
p->stack = 0;
|
|
p->stacklen = p->stackpos = 0;
|
|
p->next = jv_invalid();
|
|
p->tokenbuf = 0;
|
|
p->tokenlen = p->tokenpos = 0;
|
|
p->st = JV_PARSER_NORMAL;
|
|
p->curr_buf = 0;
|
|
p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
|
|
p->bom_strip_position = 0;
|
|
p->line = 1;
|
|
p->column = 0;
|
|
jvp_dtoa_context_init(&p->dtoa);
|
|
}
|
|
|
|
static void parser_free(struct jv_parser* p) {
|
|
jv_free(p->next);
|
|
for (int i=0; i<p->stackpos; i++)
|
|
jv_free(p->stack[i]);
|
|
jv_mem_free(p->stack);
|
|
jv_mem_free(p->tokenbuf);
|
|
jvp_dtoa_context_free(&p->dtoa);
|
|
}
|
|
|
|
/* Record val as the most recently completed value.
 *
 * Takes ownership of val in every case. Fails with a message when a
 * completed value is already pending (two values with no separator between
 * them); val is released on that path so the caller does not have to.
 * Returns 0 on success. */
static pfunc value(struct jv_parser* p, jv val) {
  if (jv_is_valid(p->next)) {
    /* val would otherwise be dropped without being freed */
    jv_free(val);
    return "Expected separator between values";
  }
  jv_free(p->next);
  p->next = val;
  return 0;
}
|
|
|
|
/* Push v onto the parser's value stack, growing the stack storage when it
 * is full. */
static void push(struct jv_parser* p, jv v) {
  assert(p->stackpos <= p->stacklen);
  if (p->stackpos == p->stacklen) {
    int grown = 2 * p->stacklen + 10;
    p->stacklen = grown;
    p->stack = jv_mem_realloc(p->stack, grown * sizeof(jv));
  }
  assert(p->stackpos < p->stacklen);
  p->stack[p->stackpos] = v;
  p->stackpos++;
}
|
|
|
|
static pfunc token(struct jv_parser* p, char ch) {
|
|
switch (ch) {
|
|
case '[':
|
|
if (jv_is_valid(p->next)) return "Expected separator between values";
|
|
push(p, jv_array());
|
|
break;
|
|
|
|
case '{':
|
|
if (jv_is_valid(p->next)) return "Expected separator between values";
|
|
push(p, jv_object());
|
|
break;
|
|
|
|
case ':':
|
|
if (!jv_is_valid(p->next))
|
|
return "Expected string key before ':'";
|
|
if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
|
|
return "':' not as part of an object";
|
|
if (jv_get_kind(p->next) != JV_KIND_STRING)
|
|
return "Object keys must be strings";
|
|
push(p, p->next);
|
|
p->next = jv_invalid();
|
|
break;
|
|
|
|
case ',':
|
|
if (!jv_is_valid(p->next))
|
|
return "Expected value before ','";
|
|
if (p->stackpos == 0)
|
|
return "',' not as part of an object or array";
|
|
if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
|
|
p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
|
|
p->next = jv_invalid();
|
|
} else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
|
|
assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
|
|
p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
|
|
p->stack[p->stackpos-1], p->next);
|
|
p->stackpos--;
|
|
p->next = jv_invalid();
|
|
} else {
|
|
// this case hits on input like {"a", "b"}
|
|
return "Objects must consist of key:value pairs";
|
|
}
|
|
break;
|
|
|
|
case ']':
|
|
if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
|
|
return "Unmatched ']'";
|
|
if (jv_is_valid(p->next)) {
|
|
p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
|
|
p->next = jv_invalid();
|
|
} else {
|
|
if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
|
|
// this case hits on input like [1,2,3,]
|
|
return "Expected another array element";
|
|
}
|
|
}
|
|
jv_free(p->next);
|
|
p->next = p->stack[--p->stackpos];
|
|
break;
|
|
|
|
case '}':
|
|
if (p->stackpos == 0)
|
|
return "Unmatched '}'";
|
|
if (jv_is_valid(p->next)) {
|
|
if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
|
|
return "Objects must consist of key:value pairs";
|
|
assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
|
|
p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2],
|
|
p->stack[p->stackpos-1], p->next);
|
|
p->stackpos--;
|
|
p->next = jv_invalid();
|
|
} else {
|
|
if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
|
|
return "Unmatched '}'";
|
|
if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
|
|
return "Expected another key-value pair";
|
|
}
|
|
jv_free(p->next);
|
|
p->next = p->stack[--p->stackpos];
|
|
break;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
static void tokenadd(struct jv_parser* p, char c) {
|
|
assert(p->tokenpos <= p->tokenlen);
|
|
if (p->tokenpos == p->tokenlen) {
|
|
p->tokenlen = p->tokenlen*2 + 256;
|
|
p->tokenbuf = jv_mem_realloc(p->tokenbuf, p->tokenlen);
|
|
}
|
|
assert(p->tokenpos < p->tokenlen);
|
|
p->tokenbuf[p->tokenpos++] = c;
|
|
}
|
|
|
|
/* Decode exactly four hexadecimal digits (either case) starting at hex.
 * Returns the value 0..0xFFFF, or -1 as soon as a non-hex character is
 * met; characters after the offending one are not examined. */
static int unhex4(char* hex) {
  int acc = 0;
  for (const char* cur = hex; cur != hex + 4; cur++) {
    const char c = *cur;
    int nibble;
    if (c >= '0' && c <= '9') {
      nibble = c - '0';
    } else if (c >= 'a' && c <= 'f') {
      nibble = 10 + (c - 'a');
    } else if (c >= 'A' && c <= 'F') {
      nibble = 10 + (c - 'A');
    } else {
      return -1;
    }
    acc = (acc << 4) | nibble;
  }
  return acc;
}
|
|
|
|
static pfunc found_string(struct jv_parser* p) {
|
|
char* in = p->tokenbuf;
|
|
char* out = p->tokenbuf;
|
|
char* end = p->tokenbuf + p->tokenpos;
|
|
|
|
while (in < end) {
|
|
char c = *in++;
|
|
if (c == '\\') {
|
|
if (in >= end)
|
|
return "Expected escape character at end of string";
|
|
c = *in++;
|
|
switch (c) {
|
|
case '\\':
|
|
case '"':
|
|
case '/': *out++ = c; break;
|
|
case 'b': *out++ = '\b'; break;
|
|
case 'f': *out++ = '\f'; break;
|
|
case 't': *out++ = '\t'; break;
|
|
case 'n': *out++ = '\n'; break;
|
|
case 'r': *out++ = '\r'; break;
|
|
|
|
case 'u':
|
|
/* ahh, the complicated case */
|
|
if (in + 4 > end)
|
|
return "Invalid \\uXXXX escape";
|
|
int hexvalue = unhex4(in);
|
|
if (hexvalue < 0)
|
|
return "Invalid characters in \\uXXXX escape";
|
|
unsigned long codepoint = (unsigned long)hexvalue;
|
|
in += 4;
|
|
if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
|
|
/* who thought UTF-16 surrogate pairs were a good idea? */
|
|
if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
|
|
return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
|
|
unsigned long surrogate = unhex4(in+2);
|
|
if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
|
|
return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
|
|
in += 6;
|
|
codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
|
|
|(surrogate - 0xDC00));
|
|
}
|
|
// FIXME assert valid codepoint
|
|
out += jvp_utf8_encode(codepoint, out);
|
|
break;
|
|
|
|
default:
|
|
return "Invalid escape";
|
|
}
|
|
} else {
|
|
*out++ = c;
|
|
}
|
|
}
|
|
TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
|
|
p->tokenpos = 0;
|
|
return 0;
|
|
}
|
|
|
|
static pfunc check_literal(struct jv_parser* p) {
|
|
if (p->tokenpos == 0) return 0;
|
|
|
|
const char* pattern = 0;
|
|
int plen;
|
|
jv v;
|
|
switch (p->tokenbuf[0]) {
|
|
case 't': pattern = "true"; plen = 4; v = jv_true(); break;
|
|
case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
|
|
case 'n': pattern = "null"; plen = 4; v = jv_null(); break;
|
|
}
|
|
if (pattern) {
|
|
if (p->tokenpos != plen) return "Invalid literal";
|
|
for (int i=0; i<plen; i++)
|
|
if (p->tokenbuf[i] != pattern[i])
|
|
return "Invalid literal";
|
|
TRY(value(p, v));
|
|
} else {
|
|
// FIXME: better parser
|
|
p->tokenbuf[p->tokenpos] = 0; // FIXME: invalid
|
|
char* end = 0;
|
|
double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
|
|
if (end == 0 || *end != 0)
|
|
return "Invalid numeric literal";
|
|
TRY(value(p, jv_number(d)));
|
|
}
|
|
p->tokenpos = 0;
|
|
return 0;
|
|
}
|
|
|
|
/* Character classes used by the scanner outside of strings. */
typedef enum {
  LITERAL,      /* part of a bare token (number, true, false, null, ...) */
  WHITESPACE,   /* space, tab, carriage return, newline */
  STRUCTURE,    /* one of [ , ] { : } */
  QUOTE,        /* double quote */
  INVALID       /* never produced by classify() */
} chclass;

/* Map a character to its class. Everything that is not whitespace, a double
 * quote or a structural character counts as LITERAL. */
static chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
    return WHITESPACE;
  }
  if (c == '"') {
    return QUOTE;
  }
  if (c == '[' || c == ',' || c == ']' ||
      c == '{' || c == ':' || c == '}') {
    return STRUCTURE;
  }
  return LITERAL;
}
|
|
|
|
|
|
// Sentinel presult returned by scan() when a complete value was stored in its
// output parameter. jv_parser_next() tells it apart from error messages by
// pointer comparison (msg == OK).
static const presult OK = "output produced";
|
|
|
|
/* If no container is open and a completed value is pending, move that value
 * into *out and clear the pending slot.
 * Returns 1 when a value was handed over, 0 otherwise (*out untouched). */
static int check_done(struct jv_parser* p, jv* out) {
  if (p->stackpos != 0 || !jv_is_valid(p->next)) {
    return 0;
  }
  *out = p->next;
  p->next = jv_invalid();
  return 1;
}
|
|
|
|
/* Feed one input character to the parser.
 *
 * Updates the line/column counters, then either accumulates the character
 * (inside a string or a bare token) or finishes the current token and acts
 * on the character.
 *
 * Returns OK when a complete top-level value has been stored in *out, 0 when
 * more input is needed, or an error message. */
static pfunc scan(struct jv_parser* p, char ch, jv* out) {
  if (ch == '\n') {
    p->line++;
    p->column = 0;
  } else {
    p->column++;
  }

  int produced = 0;

  if (p->st != JV_PARSER_NORMAL) {
    /* Inside a string. Only an unescaped quote ends it. */
    if (p->st == JV_PARSER_STRING && ch == '"') {
      TRY(found_string(p));
      p->st = JV_PARSER_NORMAL;
      produced = check_done(p, out);
    } else {
      tokenadd(p, ch);
      /* A backslash in plain string state escapes the next character; any
         character seen in escape state returns to plain string state. */
      if (p->st == JV_PARSER_STRING && ch == '\\') {
        p->st = JV_PARSER_STRING_ESCAPE;
      } else {
        p->st = JV_PARSER_STRING;
      }
    }
    return produced ? OK : 0;
  }

  chclass cls = classify(ch);
  if (cls == LITERAL) {
    tokenadd(p, ch);
  } else {
    /* Any non-literal character terminates a bare token in progress. */
    TRY(check_literal(p));
    if (check_done(p, out)) {
      produced = 1;
    }
    if (cls == QUOTE) {
      p->st = JV_PARSER_STRING;
    } else if (cls == STRUCTURE) {
      TRY(token(p, ch));
    } else if (cls == INVALID) {
      return "Invalid character";
    }
  }
  if (check_done(p, out)) {
    produced = 1;
  }
  return produced ? OK : 0;
}
|
|
|
|
struct jv_parser* jv_parser_new() {
|
|
struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser));
|
|
parser_init(p);
|
|
return p;
|
|
}
|
|
|
|
/* Destroy a parser: first release everything it owns, then the parser
 * structure itself. */
void jv_parser_free(struct jv_parser* p) {
  parser_free(p);   /* owned values and buffers */
  jv_mem_free(p);   /* the structure */
}
|
|
|
|
// The three bytes of a UTF-8 byte order mark; jv_parser_set_buf() skips them
// when they appear at the very start of the input.
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
|
|
|
|
void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
|
|
assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
|
|
&& "previous buffer not exhausted");
|
|
while (p->bom_strip_position < sizeof(UTF8_BOM)) {
|
|
if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
|
|
// matched a BOM character
|
|
buf++;
|
|
length--;
|
|
p->bom_strip_position++;
|
|
} else {
|
|
if (p->bom_strip_position == 0) {
|
|
// no BOM in this document
|
|
p->bom_strip_position = sizeof(UTF8_BOM);
|
|
} else {
|
|
// malformed BOM (prefix present, rest missing)
|
|
p->bom_strip_position = 0xff;
|
|
}
|
|
}
|
|
}
|
|
p->curr_buf = buf;
|
|
p->curr_buf_length = length;
|
|
p->curr_buf_pos = 0;
|
|
p->curr_buf_is_partial = is_partial;
|
|
}
|
|
|
|
jv jv_parser_next(struct jv_parser* p) {
|
|
assert(p->curr_buf && "a buffer must be provided");
|
|
if (p->bom_strip_position == 0xff) return jv_invalid_with_msg(jv_string("Malformed BOM"));
|
|
jv value;
|
|
presult msg = 0;
|
|
while (!msg && p->curr_buf_pos < p->curr_buf_length) {
|
|
char ch = p->curr_buf[p->curr_buf_pos++];
|
|
msg = scan(p, ch, &value);
|
|
}
|
|
if (msg == OK) {
|
|
return value;
|
|
} else if (msg) {
|
|
return jv_invalid_with_msg(jv_string_fmt("%s at line %d, column %d", msg, p->line, p->column));
|
|
} else if (p->curr_buf_is_partial) {
|
|
assert(p->curr_buf_pos == p->curr_buf_length);
|
|
// need another buffer
|
|
return jv_invalid();
|
|
} else {
|
|
assert(p->curr_buf_pos == p->curr_buf_length);
|
|
// at EOF
|
|
if (p->st != JV_PARSER_NORMAL)
|
|
return jv_invalid_with_msg(jv_string("Unfinished string"));
|
|
if ((msg = check_literal(p)))
|
|
return jv_invalid_with_msg(jv_string(msg));
|
|
if (p->stackpos != 0)
|
|
return jv_invalid_with_msg(jv_string("Unfinished JSON term"));
|
|
// p->next is either invalid (nothing here but no syntax error)
|
|
// or valid (this is the value). either way it's the thing to return
|
|
value = p->next;
|
|
p->next = jv_invalid();
|
|
return value;
|
|
}
|
|
}
|
|
|
|
jv jv_parse_sized(const char* string, int length) {
|
|
struct jv_parser parser;
|
|
parser_init(&parser);
|
|
jv_parser_set_buf(&parser, string, length, 0);
|
|
jv value = jv_parser_next(&parser);
|
|
if (jv_is_valid(value)) {
|
|
jv next = jv_parser_next(&parser);
|
|
if (jv_is_valid(next)) {
|
|
// multiple JSON values, we only wanted one
|
|
jv_free(value);
|
|
jv_free(next);
|
|
value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
|
|
} else if (jv_invalid_has_msg(jv_copy(next))) {
|
|
// parser error after the first JSON value
|
|
jv_free(value);
|
|
value = next;
|
|
} else {
|
|
// a single valid JSON value
|
|
jv_free(next);
|
|
}
|
|
} else if (jv_invalid_has_msg(jv_copy(value))) {
|
|
// parse error, we'll return it
|
|
} else {
|
|
// no value at all
|
|
jv_free(value);
|
|
value = jv_invalid_with_msg(jv_string("Expected JSON value"));
|
|
}
|
|
parser_free(&parser);
|
|
|
|
if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
|
|
jv msg = jv_invalid_get_msg(value);
|
|
value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
|
|
jv_string_value(msg),
|
|
string));
|
|
jv_free(msg);
|
|
}
|
|
return value;
|
|
}
|
|
|
|
/* Parse exactly one JSON value from a NUL-terminated string; see
 * jv_parse_sized() for the result. */
jv jv_parse(const char* string) {
  int length = strlen(string);
  return jv_parse_sized(string, length);
}
|