UTF-8 encode and decode: Difference between revisions
Content added Content deleted
(Added solution for D) |
|||
Line 156: | Line 156: | ||
character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E |
character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E |
||
T |
T |
||
</lang> |
|||
=={{header|C}}== |
|||
<lang C> |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
/*
 * Bit masks and prefix markers used by the UTF-8 encoder and decoder.
 *
 * The values are written in hexadecimal so that any standard C compiler
 * accepts them (the 0b... binary literal form is a compiler extension
 * before C23).  The bit pattern of each constant is given alongside it.
 */

/* Payload (code point) bits carried by each kind of byte */
const char utf_1_glyph_bits = 0x7F; /* 0111 1111: single-byte sequence    */
const char utf_glyph_bits   = 0x3F; /* 0011 1111: continuation byte       */
const char utf_2_glyph_bits = 0x1F; /* 0001 1111: lead byte of 2-byte seq */
const char utf_3_glyph_bits = 0x0F; /* 0000 1111: lead byte of 3-byte seq */
const char utf_4_glyph_bits = 0x07; /* 0000 0111: lead byte of 4-byte seq */

/* Fixed high bits that identify each kind of byte */
const char utf_marker       = 0x80; /* 1000 0000: continuation byte       */
const char utf_2byte_marker = 0xC0; /* 1100 0000: lead byte of 2-byte seq */
const char utf_3byte_marker = 0xE0; /* 1110 0000: lead byte of 3-byte seq */
const char utf_4byte_marker = 0xF0; /* 1111 0000: lead byte of 4-byte seq */
|||
/* Number of UTF-8 bytes (1-4) needed for code point cp; exits on out-of-range input. */
int codepoint_len(long cp);

/* Sequence length (1-4) announced by lead byte ch; exits on a malformed lead byte. */
int utf8_len(char ch);

/* Encode one code point; returns a NUL-terminated static buffer reused by every call. */
char *codepoint2utf8(long point);

/* Decode the single UTF-8 sequence that starts at chr[0]. */
long utf82codepoint(const char *chr);
|||
/*
 * Return how many bytes the UTF-8 encoding of code point cp occupies.
 *
 *   U+0000  .. U+007F    -> 1
 *   U+0080  .. U+07FF    -> 2
 *   U+0800  .. U+FFFF    -> 3
 *   U+10000 .. U+10FFFF  -> 4
 *
 * Terminates the program with exit(1) when cp cannot be encoded:
 * negative values, values above U+10FFFF, and the UTF-16 surrogate
 * range U+D800..U+DFFF (not Unicode scalar values; RFC 3629 forbids
 * encoding them in UTF-8).
 */
int codepoint_len(const long cp)
{
    if (cp < 0 || cp > 0x10FFFF) { /* Out of bounds */
        exit(1);
    }
    if (cp >= 0xD800 && cp <= 0xDFFF) { /* Surrogate: never valid in UTF-8 */
        exit(1);
    }

    if (cp <= 0x7F) {
        return 1;
    }
    if (cp <= 0x7FF) {
        return 2;
    }
    if (cp <= 0xFFFF) {
        return 3;
    }
    return 4;
}
|||
char *codepoint2utf8(const long point) |
|||
{ |
|||
static char ret[5]; |
|||
const int bytes = codepoint_len(point); |
|||
switch(bytes) { |
|||
case 1: |
|||
ret[0] = point & utf_1_glyph_bits; |
|||
break; |
|||
case 2: |
|||
ret[0] = (point >> 6 & utf_2_glyph_bits) | utf_2byte_marker; |
|||
ret[1] = (point & utf_glyph_bits) | utf_marker; |
|||
break; |
|||
case 3: |
|||
ret[0] = (point >> 12 & utf_3_glyph_bits) | utf_3byte_marker; |
|||
ret[1] = (point >> 6 & utf_glyph_bits) | utf_marker; |
|||
ret[2] = (point & utf_glyph_bits) | utf_marker; |
|||
break; |
|||
case 4: |
|||
ret[0] = (point >> 18 & utf_4_glyph_bits) | utf_4byte_marker; |
|||
ret[1] = (point >> 12 & utf_glyph_bits) | utf_marker; |
|||
ret[2] = (point >> 6 & utf_glyph_bits) | utf_marker; |
|||
ret[3] = (point & utf_glyph_bits) | utf_marker; |
|||
break; |
|||
default: |
|||
exit(1); /* Unreachable */ |
|||
} |
|||
ret[bytes] = '\0'; |
|||
return ret; |
|||
} |
|||
int utf8_len(const char ch) |
|||
{ |
|||
int len; |
|||
if((ch & ~utf_1_glyph_bits) == 0) { |
|||
len = 1; |
|||
} else if((ch & ~utf_2_glyph_bits) == utf_2byte_marker) { |
|||
len = 2; |
|||
} else if((ch & ~utf_3_glyph_bits) == utf_3byte_marker) { |
|||
len = 3; |
|||
} else if((ch & ~utf_4_glyph_bits) == utf_4byte_marker) { |
|||
len = 4; |
|||
} else { /* Malformed leading byte */ |
|||
exit(1); |
|||
} |
|||
return len; |
|||
} |
|||
long utf82codepoint(const char chr[4]) |
|||
{ |
|||
int bytes = utf8_len(chr[0]); |
|||
signed long int codep; |
|||
switch(bytes) { |
|||
case 1: |
|||
codep = chr[0] & utf_1_glyph_bits; |
|||
break; |
|||
case 2: |
|||
codep = (chr[0] & utf_2_glyph_bits) << 6; |
|||
codep |= (chr[1] & utf_glyph_bits); |
|||
break; |
|||
case 3: |
|||
codep = (chr[0] & utf_3_glyph_bits) << 12; |
|||
codep |= (chr[1] & utf_glyph_bits ) << 6; |
|||
codep |= (chr[2] & utf_glyph_bits ); |
|||
break; |
|||
case 4: |
|||
codep = (chr[0] & utf_4_glyph_bits) << 18; |
|||
codep |= (chr[1] & utf_glyph_bits ) << 12; |
|||
codep |= (chr[2] & utf_glyph_bits ) << 6; |
|||
codep |= (chr[3] & utf_glyph_bits ); |
|||
break; |
|||
default: |
|||
exit(1); |
|||
} |
|||
return codep; |
|||
} |
|||
/*
 * Demonstration driver: encodes each sample code point to UTF-8, decodes
 * it back, and prints the character, the decoded code point and the
 * encoded bytes in hex.
 */
int main(void)
{
    /* Sample code points; the trailing 0 ends the list */
    const long input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};

    printf("Character Unicode UTF-8 encoding (hex)\n");
    printf("----------------------------------------\n");

    for(int i = 0; input[i]; ++i) {
        const char *utf8 = codepoint2utf8(input[i]);
        const long codepoint = utf82codepoint(utf8);

        /* %lx takes an unsigned long */
        printf("%s U+%-7.4lx", utf8, (unsigned long)codepoint);

        /* %hhx takes an unsigned char; plain char may be signed */
        for(int j = 0; j < 4 && utf8[j]; ++j) {
            printf("%hhx ", (unsigned char)utf8[j]);
        }
        printf("\n");
    }

    return 0;
}
|||
</lang> |
|||
Output |
|||
<lang> |
|||
Character Unicode UTF-8 encoding (hex) |
|||
---------------------------------------- |
|||
A U+0041 41 |
|||
ö U+00f6 c3 b6 |
|||
Ж U+0416 d0 96 |
|||
€ U+20ac e2 82 ac |
|||
𝄞 U+1d11e f0 9d 84 9e |
|||
</lang> |
</lang> |
||