Move some unicode handling stuff to a separate file.

2024-05-11 05:55:39 +00:00 · 2012-09-04 20:34:43 +01:00
parent 328c4a13f1
commit c5ab3b2336
4 changed files with 40 additions and 23 deletions
--- a/c/Makefile
+++ b/c/Makefile
@ -17,7 +17,7 @@ parser.tab.c: parser.y lexer.yy.h
 	bison -W -d parser.y -v --report-file=parser.info
 parser.tab.h: parser.tab.c

-parsertest: parser.tab.c lexer.yy.c main.c opcode.c bytecode.c compile.c execute.c builtin.c jv.c jv_parse.c jv_print.c jv_dtoa.c
+parsertest: parser.tab.c lexer.yy.c main.c opcode.c bytecode.c compile.c execute.c builtin.c jv.c jv_parse.c jv_print.c jv_dtoa.c jv_unicode.c
 	$(CC) -o $@ $^

 jv_test: jv_test.c jv.c jv_print.c jv_dtoa.c
--- a/c/jv_parse.c
+++ b/c/jv_parse.c
@ -4,6 +4,7 @@
 #include "jv.h"
 #include "jv_dtoa.h"
 #include "jv_parse.h"
+#include "jv_unicode.h"

 typedef const char* presult;

@ -157,27 +158,6 @@ static int unhex4(char* hex) {
  return r;
 }

-static int utf8_encode(int codepoint, char* out) {
-  assert(codepoint >= 0 && codepoint <= 0x10FFFF);
-  char* start = out;
-  if (codepoint <= 0x7F) {
-    *out++ = codepoint;
-  } else if (codepoint <= 0x7FF) {
-    *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6);
-    *out++ = 0x80 + ((codepoint & 0x03F));
-  } else if(codepoint <= 0xFFFF) {
-    *out++ = 0xE0 + ((codepoint & 0xF000) >> 12);
-    *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6);
-    *out++ = 0x80 + ((codepoint & 0x003F));
-  } else {
-    *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18);
-    *out++ = 0x80 + ((codepoint & 0x03F000) >> 12);
-    *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6);
-    *out++ = 0x80 + ((codepoint & 0x00003F));
-  }
-  return out - start;
-}
-
 static pfunc found_string(struct jv_parser* p) {
  char* in = p->tokenbuf;
  char* out = p->tokenbuf;
@ -217,7 +197,7 @@ static pfunc found_string(struct jv_parser* p) {
                                 |(surrogate - 0xDC00));
        }
        // FIXME assert valid codepoint
-        out += utf8_encode(codepoint, out);
+        out += jvp_utf8_encode(codepoint, out);
        break;

      default:
--- a/c/jv_unicode.c
+++ b/c/jv_unicode.c
@ -0,0 +1,30 @@
+#include <assert.h>
+#include "jv_unicode.h"
+int jvp_utf8_encode_length(int codepoint) {  // Returns how many bytes (1-4) jvp_utf8_encode emits for codepoint.
+  if (codepoint <= 0x7F) return 1;  // fits in 7 bits; note negative inputs also take this branch (no range check here)
+  else if (codepoint <= 0x7FF) return 2;  // fits in 11 bits
+  else if (codepoint <= 0xFFFF) return 3;  // fits in 16 bits
+  else return 4;  // anything larger; no upper bound is enforced in this function
+}
+
+int jvp_utf8_encode(int codepoint, char* out) {  // Writes codepoint as UTF-8 at out and returns the byte count (1-4); no terminator is written and no buffer size is passed, so out must have room for up to 4 bytes.
+  assert(codepoint >= 0 && codepoint <= 0x10FFFF);  // range check only (surrogate values are not rejected); disabled when NDEBUG is defined
+  char* start = out;  // kept so the number of bytes written can be computed at the end
+  if (codepoint <= 0x7F) {  // 1 byte: 0xxxxxxx
+    *out++ = codepoint;
+  } else if (codepoint <= 0x7FF) {  // 2 bytes: 110xxxxx 10xxxxxx
+    *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6);  // lead byte carries bits 6-10
+    *out++ = 0x80 + ((codepoint & 0x03F));  // continuation byte carries bits 0-5
+  } else if(codepoint <= 0xFFFF) {  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    *out++ = 0xE0 + ((codepoint & 0xF000) >> 12);  // lead byte carries bits 12-15
+    *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6);  // bits 6-11
+    *out++ = 0x80 + ((codepoint & 0x003F));  // bits 0-5
+  } else {  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18);  // lead byte carries bits 18-20
+    *out++ = 0x80 + ((codepoint & 0x03F000) >> 12);  // bits 12-17
+    *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6);  // bits 6-11
+    *out++ = 0x80 + ((codepoint & 0x00003F));  // bits 0-5
+  }
+  assert(out - start == jvp_utf8_encode_length(codepoint));  // cross-checks this encoder against the length helper
+  return out - start;  // number of bytes written
+}
--- a/c/jv_unicode.h
+++ b/c/jv_unicode.h
@ -0,0 +1,7 @@
+#ifndef JV_UNICODE_H
+#define JV_UNICODE_H
+int jvp_utf8_decode_length(char startchar);  // NOTE(review): declared here, but jv_unicode.c as added in this commit contains no definition -- confirm one exists before calling
+
+int jvp_utf8_encode_length(int codepoint);  // number of bytes (1-4) jvp_utf8_encode writes for codepoint
+int jvp_utf8_encode(int codepoint, char* out);  // writes the UTF-8 bytes for codepoint at out; returns how many were written
+#endif  // JV_UNICODE_H