Anonymous user
UTF-8 encode and decode: Difference between revisions
→{{header|C}}: rework to_utf8(), remove magic nums, and better main() loop
(→{{header|C}}: rework to_utf8(), remove magic nums, and better main() loop) |
|||
Line 165:
/*
 * One row of the UTF-8 layout table: describes how a single byte of an
 * encoded sequence is built from / split back into codepoint bits.
 */
typedef struct {
	char mask;        /* selects the codepoint (payload) bits held in this byte */
	char lead;        /* fixed marker bits OR'd into the byte; a byte matches when (byte & ~mask) == lead */
	uint32_t beg;     /* beginning of codepoint range */
	uint32_t end;     /* end of codepoint range */
	int bits_stored;  /* the number of bits from the codepoint that fits in char */
}utf_t;
utf_t * utf[] = {
/* mask lead beg end bits */
[
[
[
[
[4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777, 3 },
};
Line 190 ⟶ 192:
{
int len = 0;
for(utf_t **u = utf; *u; ++u) {▼
▲ for(utf_t **u = utf; u; ++u) {
if((cp >= (*u)->beg) && (cp <= (*u)->end)) {
break;
Line 206 ⟶ 207:
{
int len = 0;
for(utf_t **u = utf; *u; ++u) {
if((ch & ~(*u)->mask) == (*u)->lead) {
break;
Line 223 ⟶ 224:
const int bytes = codepoint_len(cp);
int shift = utf[0]->bits_stored * (bytes - 1);
ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;▼
for(int i = bytes - 1; i; --i, shift += 6) {▼
shift -= utf[0]->bits_stored;
ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead;
shift -= utf[0]->bits_stored;
}
▲ ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
ret[bytes] = '\0';
return ret;
Line 235 ⟶ 238:
{
int bytes = utf8_len(*chr);
int shift =
uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;
for(int i = 1; i < bytes; ++i, ++chr) {
shift -=
codep |= ((char)*chr & utf[0]->mask) << shift;
}
Line 248 ⟶ 251:
int main(void)
{
const uint32_t *in, input[] =
printf("Character Unicode UTF-8 encoding (hex)\n");
Line 255 ⟶ 258:
char *utf8;
uint32_t codepoint;
for(in = input; *
utf8 = to_utf8(*
codepoint = to_cp(utf8);
printf("%s U+%-7.4x", utf8, codepoint);
|