mirror of
				https://github.com/stedolan/jq.git
				synced 2024-05-11 05:55:39 +00:00 
			
		
		
		
	jq now depends on oniguruma for regex support. Modified configure.ac accordingly. Added valgrind suppression file for oniguruma to prevent one-time and bounded leaks from causing tests to fail. Signed-off-by: Nicolas Williams <nico@cryptonector.com>
		
			
				
	
	
		
			97 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			97 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
#include <stdio.h>
 | 
						|
#include <assert.h>
 | 
						|
#include "jv_unicode.h"
 | 
						|
#include "jv_utf8_tables.h"
 | 
						|
 | 
						|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
 | 
						|
  assert(in <= end);
 | 
						|
  if (in == end) {
 | 
						|
    return 0;
 | 
						|
  }
 | 
						|
  int codepoint = -1;
 | 
						|
  unsigned char first = (unsigned char)in[0];
 | 
						|
  int length = utf8_coding_length[first];
 | 
						|
  if ((first & 0x80) == 0) {
 | 
						|
    /* Fast-path for ASCII */
 | 
						|
    codepoint = first;
 | 
						|
    length = 1;
 | 
						|
  } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
 | 
						|
    /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
 | 
						|
    length = 1;
 | 
						|
  } else if (in + length > end) {
 | 
						|
    /* String ends before UTF8 sequence ends */
 | 
						|
    length = end - in;
 | 
						|
  } else {
 | 
						|
    codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
 | 
						|
    for (int i=1; i<length; i++) {
 | 
						|
      unsigned ch = (unsigned char)in[i];
 | 
						|
      if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
 | 
						|
        /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
 | 
						|
        codepoint = -1;
 | 
						|
        length = i;
 | 
						|
        break;
 | 
						|
      }
 | 
						|
      codepoint = (codepoint << 6) | (ch & 0x3f);
 | 
						|
    }
 | 
						|
    if (codepoint < utf8_first_codepoint[length]) {
 | 
						|
      /* Overlong UTF8 sequence */
 | 
						|
      codepoint = -1;
 | 
						|
    }
 | 
						|
    if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
 | 
						|
      /* Surrogate codepoints can't be encoded in UTF8 */
 | 
						|
      codepoint = -1;
 | 
						|
    }
 | 
						|
    if (codepoint > 0x10FFFF) {
 | 
						|
      /* Outside Unicode range */
 | 
						|
      codepoint = -1;
 | 
						|
    }
 | 
						|
  }
 | 
						|
  assert(length > 0);
 | 
						|
  *codepoint_ret = codepoint;
 | 
						|
  return in + length;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Report whether the bytes in [in, end) decode cleanly: returns 1 when
 * jvp_utf8_next never yields -1 before the input is exhausted, 0 otherwise.
 */
int jvp_utf8_is_valid(const char* in, const char* end) {
  const char* cursor = in;
  for (;;) {
    int decoded;
    cursor = jvp_utf8_next(cursor, end, &decoded);
    if (cursor == 0) {
      /* Reached the end without meeting a bad sequence */
      return 1;
    }
    if (decoded == -1) {
      return 0;
    }
  }
}
 | 
						|
 | 
						|
/*
 * Return the number of bytes in the UTF-8 sequence introduced by the lead
 * byte `startchar`: 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx, and 4 for
 * every other byte. The byte is not validated, so stray continuation bytes
 * and invalid lead bytes fall into the final case and report 4.
 */
int jvp_utf8_decode_length(char startchar) {
  if ((startchar & 0x80) == 0) return 1;
  /* Each mask includes the terminating 0 bit of the prefix, so that longer
     prefixes (1110xxxx, 11110xxx) are not mistaken for shorter ones. */
  else if ((startchar & 0xE0) == 0xC0) return 2;
  else if ((startchar & 0xF0) == 0xE0) return 3;
  else return 4;
}
 | 
						|
 | 
						|
/*
 * Return how many bytes the UTF-8 encoding of `codepoint` occupies:
 * 1 up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, and 4 above that.
 * The codepoint is not range-checked.
 */
int jvp_utf8_encode_length(int codepoint) {
  /* Start from the longest form and shrink while the value still fits */
  int nbytes = 4;
  if (codepoint <= 0xFFFF) nbytes = 3;
  if (codepoint <= 0x7FF) nbytes = 2;
  if (codepoint <= 0x7F) nbytes = 1;
  return nbytes;
}
 | 
						|
 | 
						|
/*
 * Write the UTF-8 encoding of `codepoint` to `out` and return the number of
 * bytes written (1 to 4). No terminator is appended. The caller must supply
 * room for the bytes written; `codepoint` is asserted to lie in
 * [0, 0x10FFFF].
 */
int jvp_utf8_encode(int codepoint, char* out) {
  assert(codepoint >= 0 && codepoint <= 0x10FFFF);
  int written;
  if (codepoint <= 0x7F) {
    /* 0xxxxxxx */
    out[0] = codepoint;
    written = 1;
  } else if (codepoint <= 0x7FF) {
    /* 110xxxxx 10xxxxxx */
    out[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
    out[1] = 0x80 | (codepoint & 0x3F);
    written = 2;
  } else if (codepoint <= 0xFFFF) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    out[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
    out[1] = 0x80 | ((codepoint >> 6) & 0x3F);
    out[2] = 0x80 | (codepoint & 0x3F);
    written = 3;
  } else {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    out[0] = 0xF0 | ((codepoint >> 18) & 0x07);
    out[1] = 0x80 | ((codepoint >> 12) & 0x3F);
    out[2] = 0x80 | ((codepoint >> 6) & 0x3F);
    out[3] = 0x80 | (codepoint & 0x3F);
    written = 4;
  }
  /* Cross-check against the length calculator */
  assert(written == jvp_utf8_encode_length(codepoint));
  return written;
}
 |