UTF-8 encode and decode: Difference between revisions
Content added Content deleted
Line 162: | Line 162: | ||
#include <stdio.h> |
#include <stdio.h> |
||
#include <stdlib.h> |
#include <stdlib.h> |
||
#include <inttypes.h> |
|||
typedef struct { |
|||
/* |
|||
char mask; /* the char data is in these bits */ |
|||
* I have used binary literal notation to highlight that these constants are masks |
|||
char lead; /* the start bytes of a utf-8 encoded char */ |
|||
* If your preferred compiler cannot parse binary literals, convert |
|||
uint32_t beg; /* beginning of codepoint range */ |
|||
* the const chars below to something your compiler will recognize |
|||
uint32_t end; /* end of codepoint range */ |
|||
*/ |
|||
}utf_t; |
|||
const char utf_1_glyph_bits = 0b01111111; |
|||
const char utf_glyph_bits = 0b00111111; |
|||
const char utf_2_glyph_bits = 0b00011111; |
|||
const char utf_3_glyph_bits = 0b00001111; |
|||
const char utf_4_glyph_bits = 0b00000111; |
|||
utf_t * utf[] = { |
|||
const char utf_marker = 0b10000000; |
|||
[0] = &(utf_t){0b00111111, 0b10000000, 0, 0 }, |
|||
const char utf_2byte_marker = 0b11000000; |
|||
[1] = &(utf_t){0b01111111, 0b00000000, 0000, 0177 }, |
|||
const char utf_3byte_marker = 0b11100000; |
|||
[2] = &(utf_t){0b00011111, 0b11000000, 0200, 03777 }, |
|||
const char utf_4byte_marker = 0b11110000; |
|||
[3] = &(utf_t){0b00001111, 0b11100000, 04000, 0177777 }, |
|||
[4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777}, |
|||
&(utf_t){0}, |
|||
}; |
|||
/* All lengths are in bytes */ |
|||
int codepoint_len(const long cp); |
|||
int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */ |
|||
int utf8_len(const char ch); |
|||
int utf8_len(const char ch); /* len of utf-8 encoded char */ |
|||
char * |
char *to_utf8(const uint32_t cp); |
||
uint32_t to_cp(const char chr[4]); |
|||
int codepoint_len(const |
int codepoint_len(const uint32_t cp) |
||
{ |
{ |
||
int len; |
int len = 0; |
||
int i = 1; |
|||
if((cp >= 0000) && (cp <= 0177)) { |
|||
for(utf_t **u = utf; u; ++u) { |
|||
len = 1; |
|||
if((cp >= (*u)->beg) && (cp <= (*u)->end)) { |
|||
break; |
|||
} |
|||
} else if((cp >= 04000) && (cp <= 0177777)) { |
|||
len |
++len; |
||
} |
|||
} else if((cp >= 0200000) && (cp <= 04177777)) { |
|||
if(len > 4) /* Out of bounds */ |
|||
len = 4; |
|||
exit(1); |
|||
} else { /* Out of bounds */ |
|||
return len; |
|||
} |
|||
int utf8_len(const char ch) |
|||
{ |
|||
int len = 0; |
|||
for(utf_t **u = utf; u; ++u) { |
|||
if((ch & ~(*u)->mask) == (*u)->lead) { |
|||
break; |
|||
} |
|||
++len; |
|||
} |
|||
if(len > 4) { /* Malformed leading byte */ |
|||
exit(1); |
exit(1); |
||
} |
} |
||
Line 202: | Line 218: | ||
} |
} |
||
char * |
char *to_utf8(const uint32_t cp) |
||
{ |
{ |
||
static char ret[5]; |
static char ret[5]; |
||
const int bytes = codepoint_len( |
const int bytes = codepoint_len(cp); |
||
int shift = 0; |
|||
switch(bytes) { |
|||
for(int i = bytes - 1; i; --i, shift += 6) { |
|||
case 1: |
|||
ret[ |
ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead; |
||
break; |
|||
case 2: |
|||
ret[0] = (point >> 6 & utf_2_glyph_bits) | utf_2byte_marker; |
|||
ret[1] = (point & utf_glyph_bits) | utf_marker; |
|||
break; |
|||
case 3: |
|||
ret[0] = (point >> 12 & utf_3_glyph_bits) | utf_3byte_marker; |
|||
ret[1] = (point >> 6 & utf_glyph_bits) | utf_marker; |
|||
ret[2] = (point & utf_glyph_bits) | utf_marker; |
|||
break; |
|||
case 4: |
|||
ret[0] = (point >> 18 & utf_4_glyph_bits) | utf_4byte_marker; |
|||
ret[1] = (point >> 12 & utf_glyph_bits) | utf_marker; |
|||
ret[2] = (point >> 6 & utf_glyph_bits) | utf_marker; |
|||
ret[3] = (point & utf_glyph_bits) | utf_marker; |
|||
break; |
|||
default: |
|||
exit(1); /* Unreachable */ |
|||
} |
} |
||
ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead; |
|||
ret[bytes] = '\0'; |
ret[bytes] = '\0'; |
||
return ret; |
return ret; |
||
} |
} |
||
uint32_t to_cp(const char chr[4]) |
|||
{ |
{ |
||
int |
int bytes = utf8_len(*chr); |
||
int shift = 6 * (bytes - 1); |
|||
if((ch & ~utf_1_glyph_bits) == 0) { |
|||
uint32_t codep = (*chr++ & utf[bytes]->mask) << shift; |
|||
len = 1; |
|||
} else if((ch & ~utf_2_glyph_bits) == utf_2byte_marker) { |
|||
len = 2; |
|||
} else if((ch & ~utf_3_glyph_bits) == utf_3byte_marker) { |
|||
len = 3; |
|||
} else if((ch & ~utf_4_glyph_bits) == utf_4byte_marker) { |
|||
len = 4; |
|||
} else { /* Malformed leading byte */ |
|||
exit(1); |
|||
} |
|||
return len; |
|||
} |
|||
for(int i = 1; i < bytes; ++i, ++chr) { |
|||
long utf82codepoint(const char chr[4]) |
|||
shift -= 6; |
|||
{ |
|||
codep |= ((char)*chr & utf[0]->mask) << shift; |
|||
signed long int codep; |
|||
switch(bytes) { |
|||
case 1: |
|||
codep = chr[0] & utf_1_glyph_bits; |
|||
break; |
|||
case 2: |
|||
codep = (chr[0] & utf_2_glyph_bits) << 6; |
|||
codep |= (chr[1] & utf_glyph_bits); |
|||
break; |
|||
case 3: |
|||
codep = (chr[0] & utf_3_glyph_bits) << 12; |
|||
codep |= (chr[1] & utf_glyph_bits ) << 6; |
|||
codep |= (chr[2] & utf_glyph_bits ); |
|||
break; |
|||
case 4: |
|||
codep = (chr[0] & utf_4_glyph_bits) << 18; |
|||
codep |= (chr[1] & utf_glyph_bits ) << 12; |
|||
codep |= (chr[2] & utf_glyph_bits ) << 6; |
|||
codep |= (chr[3] & utf_glyph_bits ); |
|||
break; |
|||
default: |
|||
exit(1); |
|||
} |
} |
||
Line 282: | Line 248: | ||
int main(void) |
int main(void) |
||
{ |
{ |
||
const |
const uint32_t *input = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0}; |
||
printf("Character Unicode UTF-8 encoding (hex)\n"); |
printf("Character Unicode UTF-8 encoding (hex)\n"); |
||
Line 288: | Line 254: | ||
char *utf8; |
char *utf8; |
||
uint32_t codepoint; |
|||
for( |
for(; *input; ++input) { |
||
utf8 = |
utf8 = to_utf8(*input); |
||
codepoint = |
codepoint = to_cp(utf8); |
||
printf("%s U+%-7. |
printf("%s U+%-7.4x", utf8, codepoint); |
||
for(int i = 0; utf8[i] && i < 4; ++i) { |
for(int i = 0; utf8[i] && i < 4; ++i) { |
||
Line 311: | Line 277: | ||
€ U+20ac e2 82 ac |
€ U+20ac e2 82 ac |
||
𝄞 U+1d11e f0 9d 84 9e |
𝄞 U+1d11e f0 9d 84 9e |
||
</lang> |
</lang> |
||