UTF-8 encode and decode: Difference between revisions

Content added Content deleted
(Added solution for D)
Line 156: Line 156:
character π„ž, code point: 1D11E, utf-8: F0 9D 84 9E
character π„ž, code point: 1D11E, utf-8: F0 9D 84 9E
T
T
</lang>

=={{header|C}}==
<lang C>
#include <stdio.h>
#include <stdlib.h>

/*
 * Bit masks and marker patterns used to build and take apart UTF-8 bytes.
 *
 * The values are written as hexadecimal literals because binary (0b...)
 * literals are a compiler extension before C23; the bit pattern each
 * constant stands for is shown in the comment beside it.
 *
 * The *_glyph_bits constants select the payload bits of a byte; the
 * *_marker constants are the fixed high bits that are OR-ed onto the
 * payload.  The markers do not fit in a signed char as positive values,
 * so the narrowing conversion is spelled out with a cast.
 */
const char utf_1_glyph_bits = 0x7F; /* 0111 1111: payload of a 1-byte sequence */
const char utf_glyph_bits = 0x3F;   /* 0011 1111: payload of a continuation byte */
const char utf_2_glyph_bits = 0x1F; /* 0001 1111: payload of a 2-byte leading byte */
const char utf_3_glyph_bits = 0x0F; /* 0000 1111: payload of a 3-byte leading byte */
const char utf_4_glyph_bits = 0x07; /* 0000 0111: payload of a 4-byte leading byte */

const char utf_marker = (char)0x80;       /* 1000 0000: continuation byte */
const char utf_2byte_marker = (char)0xC0; /* 1100 0000: leading byte, 2-byte sequence */
const char utf_3byte_marker = (char)0xE0; /* 1110 0000: leading byte, 3-byte sequence */
const char utf_4byte_marker = (char)0xF0; /* 1111 0000: leading byte, 4-byte sequence */

/*
 * Length helpers.
 * codepoint_len: number of UTF-8 bytes (1..4) needed to encode code point cp;
 *                terminates the program with exit(1) when cp is out of range.
 * utf8_len:      number of bytes (1..4) in the sequence introduced by the
 *                leading byte ch; terminates with exit(1) on a malformed
 *                leading byte.
 */
int codepoint_len(const long cp);
int utf8_len(const char ch);

/*
 * Conversions.
 * codepoint2utf8: encodes point and returns a pointer to a static,
 *                 NUL-terminated buffer that is overwritten by the next call.
 * utf82codepoint: decodes the single UTF-8 sequence starting at chr[0] and
 *                 returns its code point.
 */
char *codepoint2utf8(const long point);
long utf82codepoint(const char chr[4]);

/*
 * Return how many bytes the UTF-8 encoding of code point cp occupies.
 *
 *   U+0000  .. U+007F    -> 1
 *   U+0080  .. U+07FF    -> 2
 *   U+0800  .. U+FFFF    -> 3
 *   U+10000 .. U+10FFFF  -> 4
 *
 * Any value outside U+0000..U+10FFFF terminates the program with exit(1).
 * Note that the surrogate range U+D800..U+DFFF is not singled out; it is
 * treated like any other 3-byte code point.
 */
int codepoint_len(const long cp)
{
    /* Out of bounds */
    if (cp < 0x0 || cp > 0x10FFFF) {
        exit(1);
    }

    if (cp <= 0x7F) {
        return 1;
    }
    if (cp <= 0x7FF) {
        return 2;
    }
    if (cp <= 0xFFFF) {
        return 3;
    }
    return 4;
}

/*
 * Encode the code point `point` as UTF-8.
 *
 * Returns a pointer to a static, NUL-terminated buffer; the contents are
 * overwritten by the next call, so copy the result if it must be kept.
 * An out-of-range code point ends the program inside codepoint_len().
 */
char *codepoint2utf8(const long point)
{
    static char ret[5];
    const int bytes = codepoint_len(point);
    char lead_mask;
    char lead_marker;

    /* Pick the payload mask and the fixed high bits of the leading byte. */
    switch (bytes) {
    case 1:
        lead_mask = utf_1_glyph_bits;
        lead_marker = 0; /* a 1-byte sequence carries no marker bits */
        break;
    case 2:
        lead_mask = utf_2_glyph_bits;
        lead_marker = utf_2byte_marker;
        break;
    case 3:
        lead_mask = utf_3_glyph_bits;
        lead_marker = utf_3byte_marker;
        break;
    case 4:
        lead_mask = utf_4_glyph_bits;
        lead_marker = utf_4byte_marker;
        break;
    default:
        exit(1); /* Unreachable */
    }

    /*
     * Fill the continuation bytes from the last one backwards, consuming
     * six payload bits per byte; whatever is left belongs to the leading
     * byte.
     */
    long remaining = point;
    for (int pos = bytes - 1; pos > 0; --pos) {
        ret[pos] = (remaining & utf_glyph_bits) | utf_marker;
        remaining >>= 6;
    }
    ret[0] = (remaining & lead_mask) | lead_marker;

    ret[bytes] = '\0';
    return ret;
}

/*
 * Return the length in bytes (1..4) of the UTF-8 sequence introduced by
 * the leading byte ch.
 *
 * The byte is matched against each leading-byte pattern in turn: the bits
 * outside the payload mask must equal the marker for that length (no
 * marker bits at all for a 1-byte sequence).  A byte matching none of the
 * patterns terminates the program with exit(1).
 */
int utf8_len(const char ch)
{
    const char payload_mask[4] = {
        utf_1_glyph_bits, utf_2_glyph_bits, utf_3_glyph_bits, utf_4_glyph_bits
    };
    const char lead_marker[4] = {
        0, utf_2byte_marker, utf_3byte_marker, utf_4byte_marker
    };

    for (int idx = 0; idx < 4; ++idx) {
        if ((ch & ~payload_mask[idx]) == lead_marker[idx]) {
            return idx + 1;
        }
    }

    /* Malformed leading byte */
    exit(1);
}

/*
 * Decode the single UTF-8 sequence that starts at chr[0] and return its
 * code point.
 *
 * The sequence length is taken from the leading byte via utf8_len(), which
 * ends the program on a malformed leading byte.  Only the payload bits of
 * the following bytes are used; they are not checked for the continuation
 * marker.
 */
long utf82codepoint(const char chr[4])
{
    const int bytes = utf8_len(chr[0]);
    char lead_mask;

    /* Select the payload bits carried by the leading byte. */
    switch (bytes) {
    case 1:
        lead_mask = utf_1_glyph_bits;
        break;
    case 2:
        lead_mask = utf_2_glyph_bits;
        break;
    case 3:
        lead_mask = utf_3_glyph_bits;
        break;
    case 4:
        lead_mask = utf_4_glyph_bits;
        break;
    default:
        exit(1);
    }

    /* Start with the leading byte, then append six bits per continuation byte. */
    long codep = chr[0] & lead_mask;
    for (int pos = 1; pos < bytes; ++pos) {
        codep = (codep << 6) | (chr[pos] & utf_glyph_bits);
    }

    return codep;
}

/*
 * Demo driver: for each sample code point, encode it to UTF-8, decode the
 * result back, and print the character, the decoded code point and the
 * encoded bytes in hex.
 */
int main(void)
{
    /* Zero-terminated list of sample code points. */
    const long input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};

    printf("Character Unicode UTF-8 encoding (hex)\n");
    printf("----------------------------------------\n");

    for (int i = 0; input[i] != 0; ++i) {
        /* Points into codepoint2utf8's static buffer; valid until the next call. */
        const char *utf8 = codepoint2utf8(input[i]);
        const long codepoint = utf82codepoint(utf8);

        /* %lx expects unsigned long, so convert explicitly. */
        printf("%s U+%-7.4lx", utf8, (unsigned long)codepoint);

        /*
         * A distinct index name avoids shadowing the outer loop variable.
         * %hhx expects an unsigned char value, so convert each byte before
         * printing.
         */
        for (int j = 0; j < 4 && utf8[j] != '\0'; ++j) {
            printf("%hhx ", (unsigned char)utf8[j]);
        }
        printf("\n");
    }
    return 0;
}
</lang>
Output
<lang>
Character Unicode UTF-8 encoding (hex)
----------------------------------------
A U+0041 41
ö U+00f6 c3 b6
Ж U+0416 d0 96
€ U+20ac e2 82 ac
𝄞 U+1d11e f0 9d 84 9e
</lang>
</lang>