UTF-8 encode and decode: Difference between revisions

→‎{{header|C}}: rework to_utf8(), remove magic nums, and better main() loop
(→‎{{header|C}}: rework to_utf8(), remove magic nums, and better main() loop)
Line 165:
 
/*
 * One row of the UTF-8 encoding table.
 *
 * mask and lead are deliberately plain `char`: the decoding code compares
 * `(ch & ~mask) == lead` against a plain `char` input byte, so both sides
 * must share the same signedness.
 */
typedef struct {
	char mask;       /* char data will be bitwise AND with this */
	char lead;       /* start bytes of current char in utf-8 encoded character */
	uint32_t beg;    /* beginning of codepoint range */
	uint32_t end;    /* end of codepoint range */
	int bits_stored; /* the number of bits from the codepoint that fits in char */
}utf_t;

/*
 * UTF-8 encoding table.
 *
 * utf[0] describes a continuation byte (10xxxxxx, 6 payload bits).
 * utf[n], n = 1..4, describes the first byte of an n-byte sequence together
 * with the codepoint range that needs n bytes.
 *
 * The table is NULL-terminated so that scans of the form
 * `for (utf_t **u = utf; *u; ++u)` stop at the end of the table instead of
 * reading past it when no row matches.
 */
utf_t * utf[] = {
	/*             mask        lead        beg      end       bits */
	[0] = &(utf_t){0b00111111, 0b10000000, 0,       0,        6    },
	[1] = &(utf_t){0b01111111, 0b00000000, 0000,    0177,     7    },
	[2] = &(utf_t){0b00011111, 0b11000000, 0200,    03777,    5    },
	[3] = &(utf_t){0b00001111, 0b11100000, 04000,   0177777,  4    },
	[4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777, 3    },
	      NULL, /* end-of-table sentinel */
};
 
Line 190 ⟶ 192:
{
int len = 0;
for(utf_t **u = utf; *u; ++u) {
int i = 1;
for(utf_t **u = utf; u; ++u) {
if((cp >= (*u)->beg) && (cp <= (*u)->end)) {
break;
Line 206 ⟶ 207:
{
int len = 0;
for(utf_t **u = utf; *u; ++u) {
if((ch & ~(*u)->mask) == (*u)->lead) {
break;
Line 223 ⟶ 224:
const int bytes = codepoint_len(cp);
 
int shift = utf[0]->bits_stored * (bytes - 1);
ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
for(int i = bytes - 1; i; --i, shift += 6) {
shift -= utf[0]->bits_stored;
for(int i = 1; i < bytes; ++i) {
ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead;
shift -= utf[0]->bits_stored;
}
ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
ret[bytes] = '\0';
return ret;
Line 235 ⟶ 238:
{
int bytes = utf8_len(*chr);
int shift = utf[0]->bits_stored * (bytes - 1);
uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;
 
for(int i = 1; i < bytes; ++i, ++chr) {
shift -= utf[0]->bits_stored;
codep |= ((char)*chr & utf[0]->mask) << shift;
}
Line 248 ⟶ 251:
int main(void)
{
const uint32_t *in, input[] = {0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};
 
printf("Character Unicode UTF-8 encoding (hex)\n");
Line 255 ⟶ 258:
char *utf8;
uint32_t codepoint;
for(in = input; *in; ++in) {
utf8 = to_utf8(*in);
codepoint = to_cp(utf8);
printf("%s U+%-7.4x", utf8, codepoint);
Anonymous user