UTF-8 encode and decode: Difference between revisions

Content added Content deleted

Inline

@@ Line 156: / Line 156: @@
 character 𝄞, code point:  1D11E, utf-8: F0 9D 84 9E
 T
+</lang>
+=={{header|C}}==
+<lang C>
+#include <stdio.h>
+#include <stdlib.h>
/*
 * UTF-8 bit layout (RFC 3629):
 *
 *   code point range        byte 1    byte 2    byte 3    byte 4
 *   U+0000  .. U+007F       0xxxxxxx
 *   U+0080  .. U+07FF       110xxxxx  10xxxxxx
 *   U+0800  .. U+FFFF       1110xxxx  10xxxxxx  10xxxxxx
 *   U+10000 .. U+10FFFF     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
 *
 * The *_glyph_bits constants select the payload ("x") bits of a byte; the
 * *_marker constants are the fixed high bits that tag the byte's role.
 *
 * They are written in hexadecimal (binary literals are not standard C before
 * C23) and stored as unsigned char, because the marker values do not fit in a
 * plain char on platforms where char is signed.
 */
const unsigned char utf_1_glyph_bits = 0x7F; /* 0111 1111: payload of a 1-byte sequence    */
const unsigned char utf_glyph_bits   = 0x3F; /* 0011 1111: payload of a continuation byte  */
const unsigned char utf_2_glyph_bits = 0x1F; /* 0001 1111: payload of a 2-byte lead byte   */
const unsigned char utf_3_glyph_bits = 0x0F; /* 0000 1111: payload of a 3-byte lead byte   */
const unsigned char utf_4_glyph_bits = 0x07; /* 0000 0111: payload of a 4-byte lead byte   */
const unsigned char utf_marker       = 0x80; /* 1000 0000: tag of a continuation byte      */
const unsigned char utf_2byte_marker = 0xC0; /* 1100 0000: tag of a 2-byte lead byte       */
const unsigned char utf_3byte_marker = 0xE0; /* 1110 0000: tag of a 3-byte lead byte       */
const unsigned char utf_4byte_marker = 0xF0; /* 1111 0000: tag of a 4-byte lead byte       */

int codepoint_len(const long cp);
int utf8_len(const char ch);
char *codepoint2utf8(const long point);
long utf82codepoint(const char chr[4]);

/*
 * Return the number of bytes (1..4) needed to encode code point `cp` in UTF-8.
 *
 * Terminates the program with EXIT_FAILURE when `cp` is not a Unicode scalar
 * value: negative, above U+10FFFF, or inside the surrogate range
 * U+D800..U+DFFF (surrogates must not be encoded in UTF-8).
 */
int codepoint_len(const long cp)
{
	if (cp < 0 || cp > 0x10FFFF) { /* Out of bounds */
		exit(EXIT_FAILURE);
	}
	if (cp >= 0xD800 && cp <= 0xDFFF) { /* UTF-16 surrogate, not encodable */
		exit(EXIT_FAILURE);
	}
	if (cp <= 0x7F) {
		return 1;
	}
	if (cp <= 0x7FF) {
		return 2;
	}
	if (cp <= 0xFFFF) {
		return 3;
	}
	return 4;
}

/*
 * Encode code point `point` as a NUL-terminated UTF-8 string.
 *
 * The result lives in a static buffer that is overwritten by the next call,
 * so copy it if it must outlive that call. Invalid code points terminate the
 * program (see codepoint_len).
 */
char *codepoint2utf8(const long point)
{
	static char ret[5];
	/* Write through unsigned char so bytes >= 0x80 are stored without an
	 * out-of-range conversion on platforms where char is signed. */
	unsigned char *out = (unsigned char *)ret;
	const int bytes = codepoint_len(point);
	unsigned long rest = (unsigned long)point; /* validated non-negative above */

	/* Continuation bytes carry 6 payload bits each, filled last to first. */
	for (int i = bytes - 1; i > 0; --i) {
		out[i] = (unsigned char)((rest & utf_glyph_bits) | utf_marker);
		rest >>= 6;
	}

	/* Whatever is left belongs in the lead byte, tagged with the length. */
	switch (bytes) {
	case 1:
		out[0] = (unsigned char)(rest & utf_1_glyph_bits);
		break;
	case 2:
		out[0] = (unsigned char)((rest & utf_2_glyph_bits) | utf_2byte_marker);
		break;
	case 3:
		out[0] = (unsigned char)((rest & utf_3_glyph_bits) | utf_3byte_marker);
		break;
	case 4:
		out[0] = (unsigned char)((rest & utf_4_glyph_bits) | utf_4byte_marker);
		break;
	default:
		exit(EXIT_FAILURE); /* Unreachable: codepoint_len returns 1..4 */
	}
	out[bytes] = '\0';
	return ret;
}

/*
 * Return the total length (1..4) of the UTF-8 sequence introduced by lead
 * byte `ch`, judged only by its high-bit pattern.
 *
 * Terminates the program with EXIT_FAILURE when `ch` is not a lead byte
 * (a continuation byte 10xxxxxx, or a pattern 11111xxx).
 */
int utf8_len(const char ch)
{
	/* Work on the unsigned value so the masks behave identically whether
	 * plain char is signed or unsigned. */
	const unsigned char lead = (unsigned char)ch;

	if ((lead & ~utf_1_glyph_bits) == 0) {
		return 1;
	}
	if ((lead & ~utf_2_glyph_bits) == utf_2byte_marker) {
		return 2;
	}
	if ((lead & ~utf_3_glyph_bits) == utf_3byte_marker) {
		return 3;
	}
	if ((lead & ~utf_4_glyph_bits) == utf_4byte_marker) {
		return 4;
	}
	exit(EXIT_FAILURE); /* Malformed leading byte */
}

/*
 * Decode the single UTF-8 sequence starting at `chr` and return its code point.
 *
 * `chr` only needs to hold the sequence itself (1..4 bytes); bytes are
 * examined one at a time and decoding stops at the first byte that is not a
 * continuation byte, so a NUL-terminated but truncated sequence is never read
 * past its terminator.
 *
 * Terminates the program with EXIT_FAILURE on malformed input: a bad lead
 * byte, a missing or invalid continuation byte, an overlong encoding, a
 * surrogate, or a value above U+10FFFF.
 */
long utf82codepoint(const char chr[4])
{
	const unsigned char *seq = (const unsigned char *)chr;
	const int bytes = utf8_len(chr[0]);
	long codep;

	switch (bytes) {
	case 1:
		codep = seq[0] & utf_1_glyph_bits;
		break;
	case 2:
		codep = seq[0] & utf_2_glyph_bits;
		break;
	case 3:
		codep = seq[0] & utf_3_glyph_bits;
		break;
	case 4:
		codep = seq[0] & utf_4_glyph_bits;
		break;
	default:
		exit(EXIT_FAILURE); /* Unreachable: utf8_len returns 1..4 */
	}

	for (int i = 1; i < bytes; ++i) {
		/* Every trailing byte must look like 10xxxxxx. */
		if ((seq[i] & ~utf_glyph_bits) != utf_marker) {
			exit(EXIT_FAILURE);
		}
		codep = (codep << 6) | (seq[i] & utf_glyph_bits);
	}

	/* The shortest encoding of the decoded value must be exactly as long as
	 * the sequence we read; this rejects overlong forms, and codepoint_len
	 * itself rejects surrogates and values above U+10FFFF. */
	if (codepoint_len(codep) != bytes) {
		exit(EXIT_FAILURE);
	}
	return codep;
}
/*
 * Demo driver: encodes each sample code point to UTF-8, decodes it back, and
 * prints one table row per code point showing the character, the decoded
 * code point, and the encoded bytes in hex.
 */
int main(void)
{
	const long input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e};
	/* Iterate by element count rather than a 0 sentinel in the data. */
	const size_t count = sizeof input / sizeof input[0];

	printf("Character  Unicode  UTF-8 encoding (hex)\n");
	printf("----------------------------------------\n");
	for (size_t i = 0; i < count; ++i) {
		/* codepoint2utf8 returns a buffer that the next call overwrites,
		 * so each row is printed before the next code point is encoded. */
		const char *utf8 = codepoint2utf8(input[i]);
		const long codepoint = utf82codepoint(utf8);

		/* %lx expects an unsigned long argument. */
		printf("%s          U+%-7.4lx", utf8, (unsigned long)codepoint);
		/* A sequence is at most 4 bytes; stop early at the NUL terminator. */
		for (int j = 0; j < 4 && utf8[j]; ++j) {
			printf("%hhx ", (unsigned char)utf8[j]);
		}
		printf("\n");
	}
	return 0;
}
+</lang>
+Output
+<lang>
+Character  Unicode  UTF-8 encoding (hex)
+----------------------------------------
+A          U+0041   41
+ö          U+00f6   c3 b6
+Ж          U+0416   d0 96
+€          U+20ac   e2 82 ac
+𝄞          U+1d11e  f0 9d 84 9e
 </lang>