UTF-8 encode and decode: Difference between revisions

Legend: lines prefixed with "+" are content added; lines prefixed with "-" are content deleted.

Inline

@@ Line 162: / Line 162: @@
 #include <stdio.h>
 #include <stdlib.h>
+#include <inttypes.h>
+typedef struct {
-/*
+	char mask; /* the char data is in these bits */
- * I have used binary literal notation to highlight that these constants are masks
+	char lead; /* the start bytes of a utf-8 encoded char */
- * If your preferred compiler cannot parse binary literals, convert
+	uint32_t beg; /* beginning of codepoint range */
- * the const chars below to something your compiler will recognize
+	uint32_t end; /* end of codepoint range */
- */
+}utf_t;
-const char utf_1_glyph_bits = 0b01111111;
-const char utf_glyph_bits   = 0b00111111;
-const char utf_2_glyph_bits = 0b00011111;
-const char utf_3_glyph_bits = 0b00001111;
-const char utf_4_glyph_bits = 0b00000111;
+utf_t * utf[] = {
-const char utf_marker       = 0b10000000;
+	[0] = &(utf_t){0b00111111, 0b10000000, 0,       0       },
-const char utf_2byte_marker = 0b11000000;
+	[1] = &(utf_t){0b01111111, 0b00000000, 0000,    0177    },
-const char utf_3byte_marker = 0b11100000;
+	[2] = &(utf_t){0b00011111, 0b11000000, 0200,    03777   },
-const char utf_4byte_marker = 0b11110000;
+	[3] = &(utf_t){0b00001111, 0b11100000, 04000,   0177777 },
+	[4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777},
+	&(utf_t){0},
+};
+/* All lengths are in bytes */
-int codepoint_len(const long cp);
+int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */
-int utf8_len(const char ch);
+int utf8_len(const char ch);          /* len of utf-8 encoded char */
-char *codepoint2utf8(const long point);
+char *to_utf8(const uint32_t cp);
-long utf82codepoint(const char chr[4]);
+uint32_t to_cp(const char chr[4]);
-int codepoint_len(const long cp)
+int codepoint_len(const uint32_t cp)
 {
-	int len;
+	int len = 0;
+	int i = 1;
-	if((cp >= 0000) && (cp <= 0177)) {
+	for(utf_t **u = utf; u; ++u) {
-		len = 1;
-	} else if((cp >= 0200) && (cp <= 03777)) {
+		if((cp >= (*u)->beg) && (cp <= (*u)->end)) {
-		len = 2;
+			break;
+		}
-	} else if((cp >= 04000) && (cp <= 0177777)) {
-		len = 3;
+		++len;
+	}
-	} else if((cp >= 0200000) && (cp <= 04177777)) {
+	if(len > 4) /* Out of bounds */
-		len = 4;
+		exit(1);
-	} else { /* Out of bounds */
+	return len;
+}
+int utf8_len(const char ch)
+{
+	int len = 0;
+	for(utf_t **u = utf; u; ++u) {
+		if((ch & ~(*u)->mask) == (*u)->lead) {
+			break;
+		}
+		++len;
+	}
+	if(len > 4) { /* Malformed leading byte */
 		exit(1);
 	}
@@ Line 202: / Line 218: @@
 }
-char *codepoint2utf8(const long point)
+char *to_utf8(const uint32_t cp)
 {
 	static char ret[5];
-	const int bytes = codepoint_len(point);
+	const int bytes = codepoint_len(cp);
+	int shift = 0;
-	switch(bytes) {
+	for(int i = bytes - 1; i; --i, shift += 6) {
-	case 1:
-		ret[0] = point & utf_1_glyph_bits;
+		ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead;
-		break;
-	case 2:
-		ret[0] = (point >> 6 & utf_2_glyph_bits) | utf_2byte_marker;
-		ret[1] = (point    & utf_glyph_bits)   | utf_marker;
-		break;
-	case 3:
-		ret[0] = (point >> 12 & utf_3_glyph_bits) | utf_3byte_marker;
-		ret[1] = (point >> 6  & utf_glyph_bits)   | utf_marker;
-		ret[2] = (point     & utf_glyph_bits)   | utf_marker;
-		break;
-	case 4:
-		ret[0] = (point >> 18 & utf_4_glyph_bits) | utf_4byte_marker;
-		ret[1] = (point >> 12 & utf_glyph_bits)   | utf_marker;
-		ret[2] = (point >> 6  & utf_glyph_bits)   | utf_marker;
-		ret[3] = (point     & utf_glyph_bits)   | utf_marker;
-		break;
-	default:
-		exit(1); /* Unreachable */
 	}
+	ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
 	ret[bytes] = '\0';
 	return ret;
 }
-int utf8_len(const char ch)
+uint32_t to_cp(const char chr[4])
 {
-	int len;
+	int bytes = utf8_len(*chr);
+	int shift = 6 * (bytes - 1);
-	if((ch & ~utf_1_glyph_bits) == 0) {
+	uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;
-		len = 1;
-	} else if((ch & ~utf_2_glyph_bits) == utf_2byte_marker) {
-		len = 2;
-	} else if((ch & ~utf_3_glyph_bits) == utf_3byte_marker) {
-		len = 3;
-	} else if((ch & ~utf_4_glyph_bits) == utf_4byte_marker) {
-		len = 4;
-	} else { /* Malformed leading byte */
-		exit(1);
-	}
-	return len;
-}
+	for(int i = 1; i < bytes; ++i, ++chr) {
-long utf82codepoint(const char chr[4])
+		shift -= 6;
-{
-	int bytes = utf8_len(chr[0]);
+		codep |= ((char)*chr & utf[0]->mask) << shift;
-	signed long int codep;
-	switch(bytes) {
-	case 1:
-		codep = chr[0] & utf_1_glyph_bits;
-		break;
-	case 2:
-		codep  = (chr[0] & utf_2_glyph_bits) << 6;
-		codep |= (chr[1] & utf_glyph_bits);
-		break;
-	case 3:
-		codep  = (chr[0] & utf_3_glyph_bits) << 12;
-		codep |= (chr[1] & utf_glyph_bits  ) << 6;
-		codep |= (chr[2] & utf_glyph_bits  );
-		break;
-	case 4:
-		codep  = (chr[0] & utf_4_glyph_bits) << 18;
-		codep |= (chr[1] & utf_glyph_bits  ) << 12;
-		codep |= (chr[2] & utf_glyph_bits  ) << 6;
-		codep |= (chr[3] & utf_glyph_bits  );
-		break;
-	default:
-		exit(1);
 	}
@@ Line 282: / Line 248: @@
 int main(void)
 {
-	const long input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};
+	const uint32_t *input = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};
 	printf("Character  Unicode  UTF-8 encoding (hex)\n");
@@ Line 288: / Line 254: @@
 	char *utf8;
-	long codepoint;
+	uint32_t codepoint;
-	for(int i = 0; input[i]; ++i) {
+	for(; *input; ++input) {
-		utf8 = codepoint2utf8(input[i]);
+		utf8 = to_utf8(*input);
-		codepoint = utf82codepoint(utf8);
+		codepoint = to_cp(utf8);
-		printf("%s          U+%-7.4lx", utf8, codepoint);
+		printf("%s          U+%-7.4x", utf8, codepoint);
 		for(int i = 0; utf8[i] && i < 4; ++i) {
@@ Line 311: / Line 277: @@
 €          U+20ac   e2 82 ac
 𝄞          U+1d11e  f0 9d 84 9e
 </lang>