UTF-8 encode and decode: Difference between revisions

(Added solution for D)
Line 156:
character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E
T
</lang>
 
=={{header|C}}==
<lang C>
#include <stdio.h>
#include <stdlib.h>
 
/*
* I have used binary literal notation to highlight that these constants are masks
* If your preferred compiler cannot parse binary literals, convert
* the const chars below to something your compiler will recognize
*/
/* Payload masks: which bits of a byte carry code-point data. */
const char utf_1_glyph_bits = 0b01111111; /* 1-byte sequence: 7 payload bits */
const char utf_glyph_bits = 0b00111111;   /* continuation byte: 6 payload bits */
const char utf_2_glyph_bits = 0b00011111; /* 2-byte leading byte: 5 payload bits */
const char utf_3_glyph_bits = 0b00001111; /* 3-byte leading byte: 4 payload bits */
const char utf_4_glyph_bits = 0b00000111; /* 4-byte leading byte: 3 payload bits */

/* Marker (tag) bits placed in the high end of each encoded byte.
 * NOTE(review): values >= 0b10000000 do not fit a signed char; storing them is
 * implementation-defined, and utf8_len's comparisons rely on the resulting
 * sign extension matching on both sides. Works on common two's-complement
 * targets, but unsigned char would be the portable choice — confirm before
 * changing, as the comparison logic depends on it. */
const char utf_marker = 0b10000000;       /* 10xxxxxx: continuation byte */
const char utf_2byte_marker = 0b11000000; /* 110xxxxx: start of 2-byte sequence */
const char utf_3byte_marker = 0b11100000; /* 1110xxxx: start of 3-byte sequence */
const char utf_4byte_marker = 0b11110000; /* 11110xxx: start of 4-byte sequence */
 
/* Bytes (1-4) needed to UTF-8-encode code point cp; exits if out of range. */
int codepoint_len(const long cp);
/* Sequence length (1-4) implied by UTF-8 leading byte ch; exits if malformed. */
int utf8_len(const char ch);

/* Encode point into a static NUL-terminated buffer (overwritten per call). */
char *codepoint2utf8(const long point);
/* Decode one UTF-8 sequence (up to 4 bytes) back to its code point. */
long utf82codepoint(const char chr[4]);
 
/*
 * Number of UTF-8 bytes (1-4) needed to encode code point cp.
 * Exits the program if cp is outside the encodable range [0, 0x10FFFF].
 */
int codepoint_len(const long cp)
{
    /* Boundaries per RFC 3629. The original used octal literals
     * (0177, 03777, 0177777, 04177777), which obscured these well-known
     * hex ranges; the values are identical. */
    if (cp >= 0x0000 && cp <= 0x007F)
        return 1;
    if (cp >= 0x0080 && cp <= 0x07FF)
        return 2;
    if (cp >= 0x0800 && cp <= 0xFFFF)
        return 3;
    if (cp >= 0x10000 && cp <= 0x10FFFF)
        return 4;
    exit(1); /* out of bounds */
}
 
/*
 * Encode a Unicode code point as UTF-8.
 * Returns a static, NUL-terminated buffer of 1-4 bytes, overwritten on
 * every call (not reentrant). Terminates the program, via codepoint_len,
 * if the code point cannot be encoded.
 */
char *codepoint2utf8(const long point)
{
    static char ret[5];
    const int bytes = codepoint_len(point);
    int shift = 6 * (bytes - 1); /* position of the leading byte's payload */

    /* Leading byte: marker and payload width depend on sequence length. */
    switch (bytes) {
    case 1:
        ret[0] = point & utf_1_glyph_bits;
        break;
    case 2:
        ret[0] = (point >> shift & utf_2_glyph_bits) | utf_2byte_marker;
        break;
    case 3:
        ret[0] = (point >> shift & utf_3_glyph_bits) | utf_3byte_marker;
        break;
    case 4:
        ret[0] = (point >> shift & utf_4_glyph_bits) | utf_4byte_marker;
        break;
    default:
        exit(1); /* unreachable: codepoint_len returns 1-4 */
    }

    /* Continuation bytes: six payload bits each, tagged 10xxxxxx. */
    for (int i = 1; i < bytes; ++i) {
        shift -= 6;
        ret[i] = (point >> shift & utf_glyph_bits) | utf_marker;
    }

    ret[bytes] = '\0';
    return ret;
}
 
/*
 * Sequence length (1-4) implied by the UTF-8 leading byte ch, read from
 * the marker bits in its high end. Terminates the program on a byte that
 * is not a valid leading byte (e.g. a bare continuation byte).
 */
int utf8_len(const char ch)
{
    /* The mask expressions are kept exactly as declared: both sides of
     * each comparison sign-extend the same way, which is what makes the
     * signed-char arithmetic line up. */
    if ((ch & ~utf_1_glyph_bits) == 0)
        return 1;
    if ((ch & ~utf_2_glyph_bits) == utf_2byte_marker)
        return 2;
    if ((ch & ~utf_3_glyph_bits) == utf_3byte_marker)
        return 3;
    if ((ch & ~utf_4_glyph_bits) == utf_4byte_marker)
        return 4;
    exit(1); /* malformed leading byte */
}
 
/*
 * Decode one UTF-8 sequence (1-4 bytes starting at chr[0]) back into a
 * Unicode code point. The sequence length is taken from the leading
 * byte; terminates the program, via utf8_len, on a malformed lead byte.
 */
long utf82codepoint(const char chr[4])
{
    const int bytes = utf8_len(chr[0]);
    signed long int codep;

    /* Payload bits of the leading byte. */
    switch (bytes) {
    case 1:
        return chr[0] & utf_1_glyph_bits;
    case 2:
        codep = chr[0] & utf_2_glyph_bits;
        break;
    case 3:
        codep = chr[0] & utf_3_glyph_bits;
        break;
    case 4:
        codep = chr[0] & utf_4_glyph_bits;
        break;
    default:
        exit(1); /* unreachable: utf8_len returns 1-4 */
    }

    /* Fold in six payload bits from each continuation byte. */
    for (int i = 1; i < bytes; ++i)
        codep = (codep << 6) | (chr[i] & utf_glyph_bits);

    return codep;
}
 
/*
 * Round-trip each test code point through the encoder and decoder,
 * printing the glyph, its code point, and the encoded bytes in hex.
 */
int main(void)
{
    /* Test code points (A, ö, Ж, €, 𝄞); 0 terminates the list. */
    const long input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};

    printf("Character Unicode UTF-8 encoding (hex)\n");
    printf("----------------------------------------\n");

    for (int i = 0; input[i]; ++i) {
        char *utf8 = codepoint2utf8(input[i]);
        long codepoint = utf82codepoint(utf8);
        printf("%s U+%-7.4lx", utf8, codepoint);

        /* Fixed: the inner loop previously shadowed the outer index `i`,
         * and read utf8[j] before checking the bound — test j < 4 first. */
        for (int j = 0; j < 4 && utf8[j]; ++j) {
            printf("%hhx ", utf8[j]);
        }
        printf("\n");
    }
    return 0;
}
</lang>
Output:
<lang>
Character Unicode UTF-8 encoding (hex)
----------------------------------------
A U+0041 41
ö U+00f6 c3 b6
Ж U+0416 d0 96
€ U+20ac e2 82 ac
𝄞 U+1d11e f0 9d 84 9e
</lang>
 
Anonymous user