Jump to content

UTF-8 encode and decode: Difference between revisions

Line 162:
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
 
typedef struct {
/*
char mask; /* the char data is in these bits */
* I have used binary literal notation to highlight that these constants are masks
char lead; /* the start bytes of a utf-8 encoded char */
* If your preferred compiler cannot parse binary literals, convert
uint32_t beg; /* beginning of codepoint range */
* the const chars below to something your compiler will recognize
uint32_t end; /* end of codepoint range */
*/
}utf_t;
const char utf_1_glyph_bits = 0b01111111;
const char utf_glyph_bits = 0b00111111;
const char utf_2_glyph_bits = 0b00011111;
const char utf_3_glyph_bits = 0b00001111;
const char utf_4_glyph_bits = 0b00000111;
 
utf_t * utf[] = {
const char utf_marker = 0b10000000;
[0] = &(utf_t){0b00111111, 0b10000000, 0, 0 },
const char utf_2byte_marker = 0b11000000;
[1] = &(utf_t){0b01111111, 0b00000000, 0000, 0177 },
const char utf_3byte_marker = 0b11100000;
[2] = &(utf_t){0b00011111, 0b11000000, 0200, 03777 },
const char utf_4byte_marker = 0b11110000;
[3] = &(utf_t){0b00001111, 0b11100000, 04000, 0177777 },
[4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777},
&(utf_t){0},
};
 
/* All lengths are in bytes */
int codepoint_len(const long cp);
int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */
int utf8_len(const char ch);
int utf8_len(const char ch); /* len of utf-8 encoded char */
 
char *codepoint2utf8to_utf8(const longuint32_t pointcp);
longuint32_t utf82codepointto_cp(const char chr[4]);
 
int codepoint_len(const longuint32_t cp)
{
int len = 0;
int i = 1;
if((cp >= 0000) && (cp <= 0177)) {
for(utf_t **u = utf; u; ++u) {
len = 1;
} else if((cp >= 0200(*u)->beg) && (cp <= 03777(*u)->end)) {
len = 2 break;
}
} else if((cp >= 04000) && (cp <= 0177777)) {
++len = 3;
}
} else if((cp >= 0200000) && (cp <= 04177777)) {
if(len > 4) /* Out of bounds */
len = 4;
exit(1);
} else { /* Out of bounds */
 
return len;
}
 
int utf8_len(const char ch)
{
int len = 0;
for(utf_t **u = utf; u; ++u) {
if((ch & ~(*u)->mask) == (*u)->lead) {
break;
}
++len;
}
if(len > 4) { /* Malformed leading byte */
exit(1);
}
Line 202 ⟶ 218:
}
 
char *codepoint2utf8to_utf8(const longuint32_t pointcp)
{
static char ret[5];
const int bytes = codepoint_len(pointcp);
 
int shift = 0;
switch(bytes) {
for(int i = bytes - 1; i; --i, shift += 6) {
case 1:
ret[0i] = point(cp >> shift & utf_1_glyph_bitsutf[0]->mask) | utf[0]->lead;
break;
case 2:
ret[0] = (point >> 6 & utf_2_glyph_bits) | utf_2byte_marker;
ret[1] = (point & utf_glyph_bits) | utf_marker;
break;
case 3:
ret[0] = (point >> 12 & utf_3_glyph_bits) | utf_3byte_marker;
ret[1] = (point >> 6 & utf_glyph_bits) | utf_marker;
ret[2] = (point & utf_glyph_bits) | utf_marker;
break;
case 4:
ret[0] = (point >> 18 & utf_4_glyph_bits) | utf_4byte_marker;
ret[1] = (point >> 12 & utf_glyph_bits) | utf_marker;
ret[2] = (point >> 6 & utf_glyph_bits) | utf_marker;
ret[3] = (point & utf_glyph_bits) | utf_marker;
break;
default:
exit(1); /* Unreachable */
}
ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
ret[bytes] = '\0';
return ret;
}
 
intuint32_t utf8_lento_cp(const char chchr[4])
{
int lenbytes = utf8_len(*chr);
int shift = 6 * (bytes - 1);
if((ch & ~utf_1_glyph_bits) == 0) {
uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;
len = 1;
} else if((ch & ~utf_2_glyph_bits) == utf_2byte_marker) {
len = 2;
} else if((ch & ~utf_3_glyph_bits) == utf_3byte_marker) {
len = 3;
} else if((ch & ~utf_4_glyph_bits) == utf_4byte_marker) {
len = 4;
} else { /* Malformed leading byte */
exit(1);
}
return len;
}
 
for(int i = 1; i < bytes; ++i, ++chr) {
long utf82codepoint(const char chr[4])
shift -= 6;
{
int bytes codep |= utf8_len((char)*chr & utf[0]->mask) << shift;
signed long int codep;
switch(bytes) {
case 1:
codep = chr[0] & utf_1_glyph_bits;
break;
case 2:
codep = (chr[0] & utf_2_glyph_bits) << 6;
codep |= (chr[1] & utf_glyph_bits);
break;
case 3:
codep = (chr[0] & utf_3_glyph_bits) << 12;
codep |= (chr[1] & utf_glyph_bits ) << 6;
codep |= (chr[2] & utf_glyph_bits );
break;
case 4:
codep = (chr[0] & utf_4_glyph_bits) << 18;
codep |= (chr[1] & utf_glyph_bits ) << 12;
codep |= (chr[2] & utf_glyph_bits ) << 6;
codep |= (chr[3] & utf_glyph_bits );
break;
default:
exit(1);
}
 
Line 282 ⟶ 248:
int main(void)
{
const longuint32_t *input[] = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};
 
printf("Character Unicode UTF-8 encoding (hex)\n");
Line 288 ⟶ 254:
 
char *utf8;
longuint32_t codepoint;
for(int i = 0; *input[i]; ++iinput) {
utf8 = codepoint2utf8to_utf8(*input[i]);
codepoint = utf82codepointto_cp(utf8);
printf("%s U+%-7.4lx4x", utf8, codepoint);
 
for(int i = 0; utf8[i] && i < 4; ++i) {
Line 311 ⟶ 277:
€ U+20ac e2 82 ac
𝄞 U+1d11e f0 9d 84 9e
 
</lang>
 
Anonymous user
Cookies help us deliver our services. By using our services, you agree to our use of cookies.