mirror of
https://github.com/stedolan/jq.git
synced 2024-05-11 05:55:39 +00:00
Ignore a UTF-8 BOM if one appears at the start of a JSON document.
Closes #45.
This commit is contained in:
20
jv_parse.c
20
jv_parse.c
@ -24,6 +24,7 @@ void jv_parser_init(struct jv_parser* p) {
|
||||
p->st = JV_PARSER_NORMAL;
|
||||
p->curr_buf = 0;
|
||||
p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
|
||||
p->bom_strip_position = 0;
|
||||
jvp_dtoa_context_init(&p->dtoa);
|
||||
}
|
||||
|
||||
@ -332,9 +333,27 @@ static pfunc scan(struct jv_parser* p, char ch, jv* out) {
|
||||
return answer;
|
||||
}
|
||||
|
||||
/* Byte sequence of the UTF-8 encoded byte order mark (U+FEFF).
 * Read-only lookup table: it is only indexed and measured with sizeof,
 * so it is const-qualified to prevent accidental modification. */
static const unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
|
||||
|
||||
void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
|
||||
assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
|
||||
&& "previous buffer not exhausted");
|
||||
while (p->bom_strip_position < sizeof(UTF8_BOM)) {
|
||||
if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
|
||||
// matched a BOM character
|
||||
buf++;
|
||||
length--;
|
||||
p->bom_strip_position++;
|
||||
} else {
|
||||
if (p->bom_strip_position == 0) {
|
||||
// no BOM in this document
|
||||
p->bom_strip_position = sizeof(UTF8_BOM);
|
||||
} else {
|
||||
// malformed BOM (prefix present, rest missing)
|
||||
p->bom_strip_position = 0xff;
|
||||
}
|
||||
}
|
||||
}
|
||||
p->curr_buf = buf;
|
||||
p->curr_buf_length = length;
|
||||
p->curr_buf_pos = 0;
|
||||
@ -343,6 +362,7 @@ void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_
|
||||
|
||||
jv jv_parser_next(struct jv_parser* p) {
|
||||
assert(p->curr_buf && "a buffer must be provided");
|
||||
if (p->bom_strip_position == 0xff) return jv_invalid_with_msg(jv_string("Malformed BOM"));
|
||||
jv value;
|
||||
presult msg = 0;
|
||||
while (!msg && p->curr_buf_pos < p->curr_buf_length) {
|
||||
|
@ -6,6 +6,7 @@ struct jv_parser {
|
||||
int curr_buf_length;
|
||||
int curr_buf_pos;
|
||||
int curr_buf_is_partial;
|
||||
unsigned bom_strip_position;
|
||||
|
||||
jv* stack;
|
||||
int stackpos;
|
||||
|
7
testdata
7
testdata
@ -31,6 +31,13 @@ null
|
||||
null
|
||||
[]
|
||||
|
||||
# The input line starts with a 0xFEFF (byte order mark) codepoint
|
||||
# No, there is no reason to have a byte order mark in UTF8 text.
|
||||
# But apparently people do, so jq shouldn't break on it.
|
||||
.
|
||||
"byte order mark"
|
||||
"byte order mark"
|
||||
|
||||
# We test escapes by matching them against Unicode codepoints
|
||||
# FIXME: more tests needed for weird unicode stuff (e.g. utf16 pairs)
|
||||
"Aa\r\n\t\b\f\u03bc"
|
||||
|
Reference in New Issue
Block a user