UTF-8 encode and decode: Difference between revisions

Content added Content deleted
Line 162: Line 162:
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <inttypes.h>


typedef struct {
/*
char mask; /* the char data is in these bits */
* I have used binary literal notation to highlight that these constants are masks
char lead; /* the start bytes of a utf-8 encoded char */
* If your preferred compiler cannot parse binary literals, convert
uint32_t beg; /* beginning of codepoint range */
* the const chars below to something your compiler will recognize
uint32_t end; /* end of codepoint range */
*/
}utf_t;
const char utf_1_glyph_bits = 0b01111111;
const char utf_glyph_bits = 0b00111111;
const char utf_2_glyph_bits = 0b00011111;
const char utf_3_glyph_bits = 0b00001111;
const char utf_4_glyph_bits = 0b00000111;


utf_t * utf[] = {
const char utf_marker = 0b10000000;
[0] = &(utf_t){0b00111111, 0b10000000, 0, 0 },
const char utf_2byte_marker = 0b11000000;
[1] = &(utf_t){0b01111111, 0b00000000, 0000, 0177 },
const char utf_3byte_marker = 0b11100000;
[2] = &(utf_t){0b00011111, 0b11000000, 0200, 03777 },
const char utf_4byte_marker = 0b11110000;
[3] = &(utf_t){0b00001111, 0b11100000, 04000, 0177777 },
[4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777},
&(utf_t){0},
};


/* All lengths are in bytes */
int codepoint_len(const long cp);
int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */
int utf8_len(const char ch);
int utf8_len(const char ch); /* len of utf-8 encoded char */


char *codepoint2utf8(const long point);
char *to_utf8(const uint32_t cp);
long utf82codepoint(const char chr[4]);
uint32_t to_cp(const char chr[4]);


int codepoint_len(const long cp)
int codepoint_len(const uint32_t cp)
{
{
int len;
int len = 0;
int i = 1;
if((cp >= 0000) && (cp <= 0177)) {
for(utf_t **u = utf; u; ++u) {
len = 1;
} else if((cp >= 0200) && (cp <= 03777)) {
if((cp >= (*u)->beg) && (cp <= (*u)->end)) {
len = 2;
break;
}
} else if((cp >= 04000) && (cp <= 0177777)) {
len = 3;
++len;
}
} else if((cp >= 0200000) && (cp <= 04177777)) {
if(len > 4) /* Out of bounds */
len = 4;
exit(1);
} else { /* Out of bounds */

return len;
}

int utf8_len(const char ch)
{
int len = 0;
for(utf_t **u = utf; u; ++u) {
if((ch & ~(*u)->mask) == (*u)->lead) {
break;
}
++len;
}
if(len > 4) { /* Malformed leading byte */
exit(1);
exit(1);
}
}
Line 202: Line 218:
}
}


char *codepoint2utf8(const long point)
char *to_utf8(const uint32_t cp)
{
{
static char ret[5];
static char ret[5];
const int bytes = codepoint_len(point);
const int bytes = codepoint_len(cp);


int shift = 0;
switch(bytes) {
for(int i = bytes - 1; i; --i, shift += 6) {
case 1:
ret[0] = point & utf_1_glyph_bits;
ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead;
break;
case 2:
ret[0] = (point >> 6 & utf_2_glyph_bits) | utf_2byte_marker;
ret[1] = (point & utf_glyph_bits) | utf_marker;
break;
case 3:
ret[0] = (point >> 12 & utf_3_glyph_bits) | utf_3byte_marker;
ret[1] = (point >> 6 & utf_glyph_bits) | utf_marker;
ret[2] = (point & utf_glyph_bits) | utf_marker;
break;
case 4:
ret[0] = (point >> 18 & utf_4_glyph_bits) | utf_4byte_marker;
ret[1] = (point >> 12 & utf_glyph_bits) | utf_marker;
ret[2] = (point >> 6 & utf_glyph_bits) | utf_marker;
ret[3] = (point & utf_glyph_bits) | utf_marker;
break;
default:
exit(1); /* Unreachable */
}
}
ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
ret[bytes] = '\0';
ret[bytes] = '\0';
return ret;
return ret;
}
}


int utf8_len(const char ch)
uint32_t to_cp(const char chr[4])
{
{
int len;
int bytes = utf8_len(*chr);
int shift = 6 * (bytes - 1);
if((ch & ~utf_1_glyph_bits) == 0) {
uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;
len = 1;
} else if((ch & ~utf_2_glyph_bits) == utf_2byte_marker) {
len = 2;
} else if((ch & ~utf_3_glyph_bits) == utf_3byte_marker) {
len = 3;
} else if((ch & ~utf_4_glyph_bits) == utf_4byte_marker) {
len = 4;
} else { /* Malformed leading byte */
exit(1);
}
return len;
}


for(int i = 1; i < bytes; ++i, ++chr) {
long utf82codepoint(const char chr[4])
shift -= 6;
{
int bytes = utf8_len(chr[0]);
codep |= ((char)*chr & utf[0]->mask) << shift;
signed long int codep;
switch(bytes) {
case 1:
codep = chr[0] & utf_1_glyph_bits;
break;
case 2:
codep = (chr[0] & utf_2_glyph_bits) << 6;
codep |= (chr[1] & utf_glyph_bits);
break;
case 3:
codep = (chr[0] & utf_3_glyph_bits) << 12;
codep |= (chr[1] & utf_glyph_bits ) << 6;
codep |= (chr[2] & utf_glyph_bits );
break;
case 4:
codep = (chr[0] & utf_4_glyph_bits) << 18;
codep |= (chr[1] & utf_glyph_bits ) << 12;
codep |= (chr[2] & utf_glyph_bits ) << 6;
codep |= (chr[3] & utf_glyph_bits );
break;
default:
exit(1);
}
}


Line 282: Line 248:
int main(void)
int main(void)
{
{
const long input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};
const uint32_t *input = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};


printf("Character Unicode UTF-8 encoding (hex)\n");
printf("Character Unicode UTF-8 encoding (hex)\n");
Line 288: Line 254:


char *utf8;
char *utf8;
long codepoint;
uint32_t codepoint;
for(int i = 0; input[i]; ++i) {
for(; *input; ++input) {
utf8 = codepoint2utf8(input[i]);
utf8 = to_utf8(*input);
codepoint = utf82codepoint(utf8);
codepoint = to_cp(utf8);
printf("%s U+%-7.4lx", utf8, codepoint);
printf("%s U+%-7.4x", utf8, codepoint);


for(int i = 0; utf8[i] && i < 4; ++i) {
for(int i = 0; utf8[i] && i < 4; ++i) {
Line 311: Line 277:
€ U+20ac e2 82 ac
€ U+20ac e2 82 ac
𝄞 U+1d11e f0 9d 84 9e
𝄞 U+1d11e f0 9d 84 9e

</lang>
</lang>