mirror of
https://github.com/stedolan/jq.git
synced 2024-05-11 05:55:39 +00:00
352 lines
8.0 KiB
C
352 lines
8.0 KiB
C
![]() |
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "jv.h"
#include "jv_dtoa.h"
|
||
|
/* Parser state shared by the routines in this file. */

/* Containers that have been opened but not yet closed, plus object keys
   that are still waiting for their value. */
jv stack[1000];
/* Number of entries currently held in `stack`. */
int stackpos = 0;

/* The most recently completed value; only meaningful while `hasnext` is set. */
jv next;
/* Nonzero while `next` holds a value that has not been consumed yet. */
int hasnext = 0;
|
||
|
|
||
|
/* Result of a parsing step: 0 on success, otherwise an error message. */
typedef const char* presult;

/* Evaluate a presult expression exactly once and, if it reports an error,
   return that error from the enclosing function. */
#define TRY(x) \
  do { \
    presult try_msg_ = (x); \
    if (try_msg_ != 0) return try_msg_; \
  } while (0)

/* Return-type marker for parsing steps. Under GCC-compatible compilers it
   also requests a warning whenever the returned result is ignored. */
#if defined(__GNUC__)
#  define pfunc __attribute__((warn_unused_result)) presult
#else
#  define pfunc presult
#endif
|
||
|
|
||
|
|
||
|
|
||
|
/*
 * Record `val` as the value that has just been parsed.
 *
 * If an earlier value is still pending (two values appeared with no
 * separator between them) an error message is returned and nothing is
 * stored. Otherwise `val` becomes the pending value and 0 is returned.
 */
pfunc value(jv val) {
  if (!hasnext) {
    next = val;
    hasnext = 1;
    return 0;
  }
  return "Expected separator between values";
}
|
||
|
|
||
|
/*
 * Push `v` onto the container stack.
 *
 * `stack` is a fixed-size array, so input nested more deeply than its
 * capacity would otherwise be written past the end of it. push() returns
 * void and therefore cannot report the failure to its callers; it prints
 * an error and terminates the program instead of corrupting memory.
 */
void push(jv v) {
  const int capacity = (int)(sizeof stack / sizeof stack[0]);
  if (stackpos >= capacity) {
    fprintf(stderr, "ERROR: JSON nested too deeply (limit %d)\n", capacity);
    exit(1);
  }
  stack[stackpos++] = v;
}
|
||
|
|
||
|
pfunc token(char ch) {
|
||
|
switch (ch) {
|
||
|
case '[':
|
||
|
if (hasnext) return "Expected separator between values";
|
||
|
push(jv_array());
|
||
|
break;
|
||
|
|
||
|
case '{':
|
||
|
if (hasnext) return "Expected separator between values";
|
||
|
push(jv_object());
|
||
|
break;
|
||
|
|
||
|
case ':':
|
||
|
if (!hasnext)
|
||
|
return "Expected string key before ':'";
|
||
|
if (stackpos == 0 || jv_get_kind(stack[stackpos-1]) != JV_KIND_OBJECT)
|
||
|
return "':' not as part of an object";
|
||
|
if (jv_get_kind(next) != JV_KIND_STRING)
|
||
|
return "Object keys must be strings";
|
||
|
push(next);
|
||
|
hasnext = 0;
|
||
|
break;
|
||
|
|
||
|
case ',':
|
||
|
if (!hasnext)
|
||
|
return "Expected value before ','";
|
||
|
if (stackpos == 0)
|
||
|
return "',' not as part of an object or array";
|
||
|
if (jv_get_kind(stack[stackpos-1]) == JV_KIND_ARRAY) {
|
||
|
stack[stackpos-1] = jv_array_append(stack[stackpos-1], next);
|
||
|
hasnext = 0;
|
||
|
} else if (jv_get_kind(stack[stackpos-1]) == JV_KIND_STRING) {
|
||
|
assert(stackpos > 1 && jv_get_kind(stack[stackpos-2]) == JV_KIND_OBJECT);
|
||
|
stack[stackpos-2] = jv_object_set(stack[stackpos-2], stack[stackpos-1], next);
|
||
|
stackpos--;
|
||
|
hasnext = 0;
|
||
|
} else {
|
||
|
// this case hits on input like {"a", "b"}
|
||
|
return "Objects must consist of key:value pairs";
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case ']':
|
||
|
if (stackpos == 0 || jv_get_kind(stack[stackpos-1]) != JV_KIND_ARRAY)
|
||
|
return "Unmatched ']'";
|
||
|
if (hasnext) {
|
||
|
stack[stackpos-1] = jv_array_append(stack[stackpos-1], next);
|
||
|
hasnext = 0;
|
||
|
} else {
|
||
|
if (jv_array_length(jv_copy(stack[stackpos-1])) != 0) {
|
||
|
// this case hits on input like [1,2,3,]
|
||
|
return "Expected another array element";
|
||
|
}
|
||
|
}
|
||
|
hasnext = 1;
|
||
|
next = stack[--stackpos];
|
||
|
break;
|
||
|
|
||
|
case '}':
|
||
|
if (stackpos == 0)
|
||
|
return "Unmatched '}'";
|
||
|
if (hasnext) {
|
||
|
if (jv_get_kind(stack[stackpos-1]) != JV_KIND_STRING)
|
||
|
return "Objects must consist of key:value pairs";
|
||
|
assert(stackpos > 1 && jv_get_kind(stack[stackpos-2]) == JV_KIND_OBJECT);
|
||
|
stack[stackpos-2] = jv_object_set(stack[stackpos-2], stack[stackpos-1], next);
|
||
|
stackpos--;
|
||
|
hasnext = 0;
|
||
|
} else {
|
||
|
if (jv_get_kind(stack[stackpos-1]) != JV_KIND_OBJECT)
|
||
|
return "Unmatched '}'";
|
||
|
// FIXME: assert object empty
|
||
|
}
|
||
|
hasnext = 1;
|
||
|
next = stack[--stackpos];
|
||
|
break;
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
|
||
|
char tokenbuf[1000];
|
||
|
int tokenpos;
|
||
|
struct dtoa_context dtoa;
|
||
|
|
||
|
void tokenadd(char c) {
|
||
|
tokenbuf[tokenpos++] = c;
|
||
|
}
|
||
|
|
||
|
/*
 * Decode exactly four hexadecimal digits starting at `hex`.
 *
 * Returns the decoded value (0..0xFFFF), or -1 if any of the four
 * characters is not a hexadecimal digit. Upper- and lower-case digits are
 * both accepted. Previously a non-hex character left the digit value
 * uninitialised, which is undefined behaviour.
 */
int unhex4(char* hex) {
  int r = 0;
  for (int i = 0; i < 4; i++) {
    char c = hex[i];
    int n;
    if ('0' <= c && c <= '9') {
      n = c - '0';
    } else if ('a' <= c && c <= 'f') {
      n = c - 'a' + 10;
    } else if ('A' <= c && c <= 'F') {
      n = c - 'A' + 10;
    } else {
      return -1;
    }
    r = (r << 4) | n;
  }
  return r;
}
|
||
|
|
||
|
/*
 * Write the UTF-8 encoding of `codepoint` to `out`.
 *
 * `codepoint` must lie in 0..0x10FFFF (checked with assert). Between one
 * and four bytes are written, so `out` needs room for four. No terminator
 * is appended. Returns the number of bytes written.
 */
int utf8_encode(int codepoint, char* out) {
  assert(codepoint >= 0 && codepoint <= 0x10FFFF);

  if (codepoint <= 0x7F) {
    /* plain ASCII: a single byte */
    out[0] = codepoint;
    return 1;
  }
  if (codepoint <= 0x7FF) {
    /* 110xxxxx 10xxxxxx */
    out[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
    out[1] = 0x80 | (codepoint & 0x3F);
    return 2;
  }
  if (codepoint <= 0xFFFF) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    out[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
    out[1] = 0x80 | ((codepoint >> 6) & 0x3F);
    out[2] = 0x80 | (codepoint & 0x3F);
    return 3;
  }
  /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  out[0] = 0xF0 | ((codepoint >> 18) & 0x07);
  out[1] = 0x80 | ((codepoint >> 12) & 0x3F);
  out[2] = 0x80 | ((codepoint >> 6) & 0x3F);
  out[3] = 0x80 | (codepoint & 0x3F);
  return 4;
}
|
||
|
|
||
|
pfunc found_string() {
|
||
|
char* in = tokenbuf;
|
||
|
char* out = tokenbuf;
|
||
|
char* end = tokenbuf + tokenpos;
|
||
|
|
||
|
while (in < end) {
|
||
|
char c = *in++;
|
||
|
if (c == '\\') {
|
||
|
if (in >= end)
|
||
|
return "Expected escape character at end of string";
|
||
|
c = *in++;
|
||
|
switch (c) {
|
||
|
case '\\':
|
||
|
case '"':
|
||
|
case '/': *out++ = c; break;
|
||
|
case 'b': *out++ = '\b'; break;
|
||
|
case 'f': *out++ = '\f'; break;
|
||
|
case 't': *out++ = '\t'; break;
|
||
|
case 'n': *out++ = '\n'; break;
|
||
|
case 'r': *out++ = '\r'; break;
|
||
|
|
||
|
case 'u':
|
||
|
/* ahh, the complicated case */
|
||
|
if (in + 4 > end)
|
||
|
return "Invalid \\uXXXX escape";
|
||
|
unsigned long codepoint = unhex4(in);
|
||
|
in += 4;
|
||
|
if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
|
||
|
/* who thought UTF-16 surrogate pairs were a good idea? */
|
||
|
if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
|
||
|
return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
|
||
|
unsigned long surrogate = unhex4(in+2);
|
||
|
if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
|
||
|
return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
|
||
|
in += 6;
|
||
|
codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
|
||
|
|(surrogate - 0xDC00));
|
||
|
}
|
||
|
// FIXME assert valid codepoint
|
||
|
out += utf8_encode(codepoint, out);
|
||
|
break;
|
||
|
|
||
|
default:
|
||
|
return "Invalid escape";
|
||
|
}
|
||
|
} else {
|
||
|
*out++ = c;
|
||
|
}
|
||
|
}
|
||
|
TRY(value(jv_string_sized(tokenbuf, out - tokenbuf)));
|
||
|
tokenpos=0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
pfunc check_literal() {
|
||
|
if (tokenpos == 0) return 0;
|
||
|
|
||
|
const char* pattern = 0;
|
||
|
int plen;
|
||
|
jv v;
|
||
|
switch (tokenbuf[0]) {
|
||
|
case 't': pattern = "true"; plen = 4; v = jv_true(); break;
|
||
|
case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
|
||
|
case 'n': pattern = "null"; plen = 4; v = jv_null(); break;
|
||
|
}
|
||
|
if (pattern) {
|
||
|
if (tokenpos != plen) return "Invalid literal";
|
||
|
for (int i=0; i<plen; i++)
|
||
|
if (tokenbuf[i] != pattern[i])
|
||
|
return "Invalid literal";
|
||
|
TRY(value(v));
|
||
|
} else {
|
||
|
// FIXME: better parser
|
||
|
tokenbuf[tokenpos] = 0; // FIXME: invalid
|
||
|
char* end = 0;
|
||
|
double d = jvp_strtod(&dtoa, tokenbuf, &end);
|
||
|
if (end == 0 || *end != 0)
|
||
|
return "Invalid numeric literal";
|
||
|
TRY(value(jv_number(d)));
|
||
|
}
|
||
|
tokenpos=0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Lexical category of a single input character seen outside of a string. */
typedef enum {
  LITERAL,     /* part of a bare token such as a number or true/false/null */
  WHITESPACE,  /* space, tab, carriage return or newline */
  STRUCTURE,   /* one of [ , ] { : } */
  QUOTE,       /* double quote: opens a string */
  INVALID      /* not produced by classify() below */
} chclass;

/*
 * Classify `c`. Every character that is not whitespace, a double quote or
 * a structural character is reported as LITERAL.
 */
chclass classify(char c) {
  if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
    return WHITESPACE;

  if (c == '"')
    return QUOTE;

  if (c == '[' || c == ',' || c == ']' ||
      c == '{' || c == ':' || c == '}')
    return STRUCTURE;

  return LITERAL;
}
|
||
|
|
||
|
|
||
|
/* Scanner mode: outside any string, inside a string, or inside a string
   immediately after a backslash. */
enum state {
  NORMAL = 0,
  STRING = 1,
  STRING_ESCAPE = 2
};

/* Current scanner mode; scanning starts outside any string. */
enum state st = NORMAL;
|
||
|
|
||
|
pfunc scan(char ch) {
|
||
|
if (st == NORMAL) {
|
||
|
chclass cls = classify(ch);
|
||
|
if (cls != LITERAL) {
|
||
|
TRY(check_literal());
|
||
|
}
|
||
|
switch (cls) {
|
||
|
case LITERAL:
|
||
|
tokenadd(ch);
|
||
|
break;
|
||
|
case WHITESPACE:
|
||
|
break;
|
||
|
case QUOTE:
|
||
|
st = STRING;
|
||
|
break;
|
||
|
case STRUCTURE:
|
||
|
TRY(token(ch));
|
||
|
break;
|
||
|
case INVALID:
|
||
|
return "Invalid character";
|
||
|
}
|
||
|
} else {
|
||
|
if (ch == '"' && st == STRING) {
|
||
|
TRY(found_string());
|
||
|
st = NORMAL;
|
||
|
} else {
|
||
|
tokenadd(ch);
|
||
|
if (ch == '\\' && st == STRING) {
|
||
|
st = STRING_ESCAPE;
|
||
|
} else {
|
||
|
st = STRING;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
pfunc finish() {
|
||
|
assert(st == NORMAL);
|
||
|
TRY(check_literal());
|
||
|
|
||
|
if (stackpos != 0)
|
||
|
return "Unfinished JSON term";
|
||
|
|
||
|
// this will happen on the empty string
|
||
|
if (!hasnext)
|
||
|
return "Expected JSON value";
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
|
||
|
int main(int argc, char* argv[]) {
|
||
|
assert(argc == 2);
|
||
|
jvp_dtoa_context_init(&dtoa);
|
||
|
char* p = argv[1];
|
||
|
char ch;
|
||
|
while ((ch = *p++)) {
|
||
|
presult msg = scan(ch);
|
||
|
if (msg){
|
||
|
printf("ERROR: %s\n", msg);
|
||
|
return 1;
|
||
|
}
|
||
|
}
|
||
|
presult msg = finish();
|
||
|
if (msg) {
|
||
|
printf("ERROR: %s\n", msg);
|
||
|
return 1;
|
||
|
}
|
||
|
jvp_dtoa_context_free(&dtoa);
|
||
|
jv_dump(next);
|
||
|
printf("\n");
|
||
|
return 0;
|
||
|
}
|