mirror of
https://github.com/stedolan/jq.git
synced 2024-05-11 05:55:39 +00:00
Fix various UTF8 parsing bugs.
In particular, parse bad UTF8 by replacing the broken bits with U+FFFD and resynchronise correctly after broken sequences.
This commit is contained in:
37
jv.c
37
jv.c
@@ -377,6 +377,32 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
|
||||||
|
static jv_nontrivial jvp_string_copy_replace_bad(const char* data, uint32_t length) {
|
||||||
|
const char* end = data + length;
|
||||||
|
const char* i = data;
|
||||||
|
const char* cstart;
|
||||||
|
|
||||||
|
uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
|
||||||
|
jvp_string* s = jvp_string_alloc(maxlength);
|
||||||
|
char* out = s->data;
|
||||||
|
int c = 0;
|
||||||
|
|
||||||
|
while ((i = jvp_utf8_next((cstart = i), end, &c))) {
|
||||||
|
if (c == -1) {
|
||||||
|
c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
}
|
||||||
|
out += jvp_utf8_encode(c, out);
|
||||||
|
assert(out < s->data + maxlength);
|
||||||
|
}
|
||||||
|
length = out - s->data;
|
||||||
|
s->data[length] = 0;
|
||||||
|
s->length_hashed = length << 1;
|
||||||
|
jv_nontrivial r = {&s->refcnt, {0,0}};
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Assumes valid UTF8 */
|
||||||
static jv_nontrivial jvp_string_new(const char* data, uint32_t length) {
|
static jv_nontrivial jvp_string_new(const char* data, uint32_t length) {
|
||||||
jvp_string* s = jvp_string_alloc(length);
|
jvp_string* s = jvp_string_alloc(length);
|
||||||
s->length_hashed = length << 1;
|
s->length_hashed = length << 1;
|
||||||
@@ -523,7 +549,9 @@ static int jvp_string_equal(jv_nontrivial* a, jv_nontrivial* b) {
|
|||||||
jv jv_string_sized(const char* str, int len) {
|
jv jv_string_sized(const char* str, int len) {
|
||||||
jv j;
|
jv j;
|
||||||
j.kind = JV_KIND_STRING;
|
j.kind = JV_KIND_STRING;
|
||||||
j.val.nontrivial = jvp_string_new(str, len);
|
j.val.nontrivial = jvp_utf8_is_valid(str, str+len) ?
|
||||||
|
jvp_string_new(str, len) :
|
||||||
|
jvp_string_copy_replace_bad(str, len);
|
||||||
return j;
|
return j;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -568,7 +596,14 @@ jv jv_string_concat(jv a, jv b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
jv jv_string_append_buf(jv a, const char* buf, int len) {
|
jv jv_string_append_buf(jv a, const char* buf, int len) {
|
||||||
|
if (jvp_utf8_is_valid(buf, buf+len)) {
|
||||||
jvp_string_append(&a.val.nontrivial, buf, len);
|
jvp_string_append(&a.val.nontrivial, buf, len);
|
||||||
|
} else {
|
||||||
|
jv b;
|
||||||
|
b.kind = JV_KIND_STRING;
|
||||||
|
b.val.nontrivial = jvp_string_copy_replace_bad(buf, len);
|
||||||
|
a = jv_string_concat(a, b);
|
||||||
|
}
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
51
jv_unicode.c
51
jv_unicode.c
@@ -3,35 +3,56 @@
|
|||||||
#include "jv_unicode.h"
|
#include "jv_unicode.h"
|
||||||
#include "jv_utf8_tables.h"
|
#include "jv_utf8_tables.h"
|
||||||
|
|
||||||
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint) {
|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
|
||||||
|
assert(in <= end);
|
||||||
if (in == end) {
|
if (in == end) {
|
||||||
codepoint = 0;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
int codepoint = -1;
|
||||||
unsigned char first = (unsigned char)in[0];
|
unsigned char first = (unsigned char)in[0];
|
||||||
int length = utf8_coding_length[first];
|
int length = utf8_coding_length[first];
|
||||||
if (length == 0 || length == UTF8_CONTINUATION_BYTE || in + length > end) {
|
if ((first & 0x80) == 0) {
|
||||||
*codepoint = -1;
|
/* Fast-path for ASCII */
|
||||||
return 0;
|
codepoint = first;
|
||||||
}
|
length = 1;
|
||||||
*codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
|
} else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
|
||||||
|
/* Bad single byte - either an invalid byte or an out-of-place continuation byte */
|
||||||
|
length = 1;
|
||||||
|
} else if (in + length > end) {
|
||||||
|
/* String ends before UTF8 sequence ends */
|
||||||
|
length = end - in;
|
||||||
|
} else {
|
||||||
|
codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
|
||||||
for (int i=1; i<length; i++) {
|
for (int i=1; i<length; i++) {
|
||||||
int ch = (unsigned char)in[i];
|
unsigned ch = (unsigned char)in[i];
|
||||||
if (utf8_coding_length[(unsigned char)in[i]] != UTF8_CONTINUATION_BYTE){
|
if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
|
||||||
*codepoint = -1;
|
/* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
|
||||||
return 0;
|
codepoint = -1;
|
||||||
|
length = i;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
*codepoint = (*codepoint << 6) | (ch & 0x3f);
|
codepoint = (codepoint << 6) | (ch & 0x3f);
|
||||||
}
|
}
|
||||||
|
if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
|
||||||
|
/* Surrogate codepoints can't be encoded in UTF8 */
|
||||||
|
codepoint = -1;
|
||||||
|
}
|
||||||
|
if (codepoint > 0x10FFFF) {
|
||||||
|
/* Outside Unicode range */
|
||||||
|
codepoint = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(length > 0);
|
||||||
|
*codepoint_ret = codepoint;
|
||||||
return in + length;
|
return in + length;
|
||||||
}
|
}
|
||||||
|
|
||||||
int jvp_utf8_verify(const char* in, const char* end) {
|
int jvp_utf8_is_valid(const char* in, const char* end) {
|
||||||
int codepoint = 0;
|
int codepoint;
|
||||||
while ((in = jvp_utf8_next(in, end, &codepoint))) {
|
while ((in = jvp_utf8_next(in, end, &codepoint))) {
|
||||||
if (codepoint == -1) return 0;
|
if (codepoint == -1) return 0;
|
||||||
}
|
}
|
||||||
return codepoint != -1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int jvp_utf8_encode_length(int codepoint) {
|
int jvp_utf8_encode_length(int codepoint) {
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
#define JV_UNICODE_H
|
#define JV_UNICODE_H
|
||||||
|
|
||||||
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
|
||||||
|
int jvp_utf8_is_valid(const char* in, const char* end);
|
||||||
|
|
||||||
int jvp_utf8_decode_length(char startchar);
|
int jvp_utf8_decode_length(char startchar);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user