UTF-8 encode and decode
Revision as of 10:57, 21 December 2017
You are encouraged to solve this task according to the task description, using any language you may know.
As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.
The goal of this task is to write an encoder that takes a Unicode code point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.
Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode character.
Demonstrate the functionality of your encoder and decoder on the following five characters:
Character   Name                                  Unicode   UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041    41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6    C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416    D0 96
€           EURO SIGN                             U+20AC    E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E   F0 9D 84 9E
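To make the bit layout concrete, here is a small worked example for the three-byte case. This is an editorial Python sketch, separate from the language entries below:
<lang python>cp = 0x20AC  # EURO SIGN: 0x0800 <= cp <= 0xFFFF, so three bytes are needed
b = bytes([0xE0 | (cp >> 12),        # lead byte 1110xxxx carries the top 4 bits
           0x80 | (cp >> 6 & 0x3F),  # continuation 10xxxxxx, middle 6 bits
           0x80 | (cp & 0x3F)])      # continuation 10xxxxxx, low 6 bits
assert b == '€'.encode('utf-8') == b'\xe2\x82\xac'
# decoding strips the markers and reassembles the payload bits
assert ((b[0] & 0x0F) << 12 | (b[1] & 0x3F) << 6 | (b[2] & 0x3F)) == cp</lang>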
Provided below is a reference implementation in Common Lisp.
Common Lisp
Helper functions
<lang lisp>
(defun ascii-byte-p (octet)
  "Return t if octet is a single-byte 7-bit ASCII char.
The most significant bit is 0, so the allowed pattern is 0xxx xxxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  (let ((bitmask  #b10000000)
        (template #b00000000))
    ;; bitwise-and with the bitmask #b10000000 to extract the most significant bit,
    ;; then check whether it equals the template #b00000000.
    (= (logand bitmask octet) template)))

(defun multi-byte-p (octet)
  "Return t if octet is a part of a multi-byte UTF-8 sequence.
The multi-byte pattern is 1xxx xxxx. A multi-byte can be either a lead byte or a trail byte."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  (let ((bitmask  #b10000000)
        (template #b10000000))
    ;; bitwise-and with the bitmask #b10000000 to extract the most significant bit,
    ;; then check whether it equals the template #b10000000.
    (= (logand bitmask octet) template)))

(defun lead-byte-p (octet)
  "Return t if octet is one of the leading bytes of an UTF-8 sequence, nil otherwise.
Allowed leading byte patterns are 0xxx xxxx, 110x xxxx, 1110 xxxx and 1111 0xxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  (let ((bitmasks  (list #b10000000 #b11100000 #b11110000 #b11111000))
        (templates (list #b00000000 #b11000000 #b11100000 #b11110000)))
    (some #'(lambda (a b) (= (logand a octet) b)) bitmasks templates)))

(defun n-trail-bytes (octet)
  "Take a leading UTF-8 byte, return the number of continuation bytes 0-3."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  (let ((bitmasks  (list #b10000000 #b11100000 #b11110000 #b11111000))
        (templates (list #b00000000 #b11000000 #b11100000 #b11110000)))
    (loop for i from 0 to 3
          when (= (nth i templates) (logand (nth i bitmasks) octet))
          return i)))
</lang>
Encoder
<lang lisp>
(defun unicode-to-utf-8 (int)
  "Take a Unicode code point, return a list of one to four UTF-8 encoded bytes (octets)."
  (assert (<= (integer-length int) 21))
  (let ((n-trail-bytes (cond ((<= #x00000 int #x00007F) 0)
                             ((<= #x00080 int #x0007FF) 1)
                             ((<= #x00800 int #x00FFFF) 2)
                             ((<= #x10000 int #x10FFFF) 3)))
        (lead-templates (list #b00000000 #b11000000 #b11100000 #b11110000))
        (trail-template #b10000000)
        ;; number of content bits in the lead byte.
        (n-lead-bits (list 7 5 4 3))
        ;; number of content bits in the trail byte.
        (n-trail-bits 6)
        ;; list to put the UTF-8 encoded bytes in.
        (byte-list nil))
    (if (= n-trail-bytes 0)
        ;; if we need 0 trail bytes, it's just an ASCII single byte.
        (push int byte-list)
        (progn
          ;; if we need more than one byte, first fill the trail bytes with 6 bits each.
          (loop for i from 0 to (1- n-trail-bytes)
                do (push (+ trail-template
                            (ldb (byte n-trail-bits (* i n-trail-bits)) int))
                         byte-list))
          ;; then copy the remaining content bits to the lead byte.
          (push (+ (nth n-trail-bytes lead-templates)
                   (ldb (byte (nth n-trail-bytes n-lead-bits)
                              (* n-trail-bytes n-trail-bits))
                        int))
                byte-list)))
    ;; return the list of UTF-8 encoded bytes.
    byte-list))
</lang>
Decoder
<lang lisp>
(defun utf-8-to-unicode (byte-list)
  "Take a list of one to four UTF-8 encoded bytes (octets), return a code point."
  (let ((b1 (car byte-list)))
    (cond ((ascii-byte-p b1) b1) ; if a single byte, just return it.
          ((multi-byte-p b1)
           (if (lead-byte-p b1)
               (let ((n (n-trail-bytes b1))
                     ;; content bits we want to extract from each lead byte.
                     (lead-templates (list #b01111111 #b00011111 #b00001111 #b00000111))
                     ;; content bits we want to extract from each trail byte.
                     (trail-template #b00111111))
                 (if (= n (1- (list-length byte-list)))
                     ;; add lead byte
                     (+ (ash (logand (nth 0 byte-list) (nth n lead-templates)) (* 6 n))
                        ;; and the trail bytes
                        (loop for i from 1 to n
                              sum (ash (logand (nth i byte-list) trail-template)
                                       (* 6 (- n i)))))
                     (error "calculated number of bytes doesn't match the length of the byte list")))
               (error "first byte in the list isn't a lead byte"))))))
</lang>
The test
<lang lisp>
(defun test-utf-8 ()
  "Return t if the chosen Unicode points are encoded and decoded correctly."
  (let* ((unicodes-orig (list 65 246 1046 8364 119070))
         (unicodes-test (mapcar #'(lambda (x) (utf-8-to-unicode (unicode-to-utf-8 x)))
                                unicodes-orig)))
    (mapcar #'(lambda (x)
                (format t "character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
                        (code-char x) x (unicode-to-utf-8 x)))
            unicodes-orig)
    ;; return t if all are t
    (every #'= unicodes-orig unicodes-test)))
</lang>
Test output
<lang lisp>CL-USER> (test-utf-8)
character A, code point:     41, utf-8: 41
character ö, code point:     F6, utf-8: C3 B6
character Ж, code point:    416, utf-8: D0 96
character €, code point:   20AC, utf-8: E2 82 AC
character 𝄞, code point:  1D11E, utf-8: F0 9D 84 9E
T
</lang>
C
<lang C>#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

typedef struct {
    char mask;    /* the char data is in these bits */
    char lead;    /* the start bytes of a utf-8 encoded char */
    uint32_t beg; /* beginning of codepoint range */
    uint32_t end; /* end of codepoint range */
} utf_t;

utf_t *utf[] = {
    [0] = &(utf_t){0b00111111, 0b10000000, 0,       0       },
    [1] = &(utf_t){0b01111111, 0b00000000, 0000,    0177    },
    [2] = &(utf_t){0b00011111, 0b11000000, 0200,    03777   },
    [3] = &(utf_t){0b00001111, 0b11100000, 04000,   0177777 },
    [4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777},
          &(utf_t){0},
};

/* All lengths are in bytes */
int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */
int utf8_len(const char ch);          /* len of utf-8 encoded char */

char *to_utf8(const uint32_t cp);
uint32_t to_cp(const char chr[4]);

int codepoint_len(const uint32_t cp)
{
    int len = 0;
    for(utf_t **u = utf; len <= 4; ++u) { /* bounded so bad input cannot scan past the table */
        if((cp >= (*u)->beg) && (cp <= (*u)->end)) {
            break;
        }
        ++len;
    }
    if(len > 4) /* Out of bounds */
        exit(1);

    return len;
}

int utf8_len(const char ch)
{
    int len = 0;
    for(utf_t **u = utf; len <= 4; ++u) {
        if((ch & ~(*u)->mask) == (*u)->lead) {
            break;
        }
        ++len;
    }
    if(len > 4) { /* Malformed leading byte */
        exit(1);
    }
    return len;
}

char *to_utf8(const uint32_t cp)
{
    static char ret[5];
    const int bytes = codepoint_len(cp);

    int shift = 0;
    for(int i = bytes - 1; i; --i, shift += 6) {
        ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead;
    }
    ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead;
    ret[bytes] = '\0';
    return ret;
}

uint32_t to_cp(const char chr[4])
{
    int bytes = utf8_len(*chr);
    int shift = 6 * (bytes - 1);
    uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;

    for(int i = 1; i < bytes; ++i, ++chr) {
        shift -= 6;
        codep |= ((char)*chr & utf[0]->mask) << shift;
    }

    return codep;
}

int main(void)
{
    const uint32_t *input = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};

    printf("Character Unicode UTF-8 encoding (hex)\n");
    printf("----------------------------------------\n");

    char *utf8;
    uint32_t codepoint;
    for(; *input; ++input) {
        utf8 = to_utf8(*input);
        codepoint = to_cp(utf8);
        printf("%s U+%-7.4x", utf8, codepoint);

        for(int i = 0; utf8[i] && i < 4; ++i) {
            printf("%hhx ", utf8[i]);
        }
        printf("\n");
    }
    return 0;
}</lang>
Output
<lang>
Character Unicode UTF-8 encoding (hex)
----------------------------------------
A U+0041   41
ö U+00f6   c3 b6
Ж U+0416   d0 96
€ U+20ac   e2 82 ac
𝄞 U+1d11e  f0 9d 84 9e
</lang>
D
<lang D>import std.conv;
import std.stdio;

immutable CHARS = ["A","ö","Ж","€","𝄞"];

void main() {
    writeln("Character Code-Point Code-Units");
    foreach (c; CHARS) {
        auto bytes = cast(ubyte[]) c;                // The raw bytes of a character can be accessed by casting
        auto unicode = cast(uint) to!dstring(c)[0];  // Convert from a UTF8 string to a UTF32 string, and cast the first character to a number
        writefln("%s %7X [%(%X, %)]", c, unicode, bytes);
    }
}</lang>
- Output:
Character Code-Point Code-Units
A      41 [41]
ö      F6 [C3, B6]
Ж     416 [D0, 96]
€    20AC [E2, 82, AC]
𝄞   1D11E [F0, 9D, 84, 9E]
Go
Implementation
This implementation is missing all checks for invalid data and so is not production-ready, but it illustrates the basic UTF-8 encoding scheme (a sketch of the omitted checks follows the output below).
<lang go>package main

import (
    "bytes"
    "encoding/hex"
    "fmt"
    "log"
    "strings"
)
var testCases = []struct {
    rune
    string
}{
    {'A', "41"},
    {'ö', "C3 B6"},
    {'Ж', "D0 96"},
    {'€', "E2 82 AC"},
    {'𝄞', "F0 9D 84 9E"},
}
func main() {
    for _, tc := range testCases {
        // derive some things from test data
        u := fmt.Sprintf("U+%04X", tc.rune)
        b, err := hex.DecodeString(strings.Replace(tc.string, " ", "", -1))
        if err != nil {
            log.Fatal("bad test data")
        }
        // exercise encoder and decoder on test data
        e := encodeUTF8(tc.rune)
        d := decodeUTF8(b)
        // show function return values
        fmt.Printf("%c %-7s %X\n", d, u, e)
        // validate return values against test data
        if !bytes.Equal(e, b) {
            log.Fatal("encodeUTF8 wrong")
        }
        if d != tc.rune {
            log.Fatal("decodeUTF8 wrong")
        }
    }
}
const (
    // first byte of a 2-byte encoding starts 110 and carries 5 bits of data
    b2Lead = 0xC0 // 1100 0000
    b2Mask = 0x1F // 0001 1111

    // first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
    b3Lead = 0xE0 // 1110 0000
    b3Mask = 0x0F // 0000 1111

    // first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
    b4Lead = 0xF0 // 1111 0000
    b4Mask = 0x07 // 0000 0111

    // non-first bytes start 10 and carry 6 bits of data
    mbLead = 0x80 // 1000 0000
    mbMask = 0x3F // 0011 1111
)
func encodeUTF8(r rune) []byte {
    switch i := uint32(r); {
    case i <= 1<<7-1: // max code point that encodes into a single byte
        return []byte{byte(r)}
    case i <= 1<<11-1: // into two bytes
        return []byte{
            b2Lead | byte(r>>6),
            mbLead | byte(r)&mbMask}
    case i <= 1<<16-1: // three
        return []byte{
            b3Lead | byte(r>>12),
            mbLead | byte(r>>6)&mbMask,
            mbLead | byte(r)&mbMask}
    default:
        return []byte{
            b4Lead | byte(r>>18),
            mbLead | byte(r>>12)&mbMask,
            mbLead | byte(r>>6)&mbMask,
            mbLead | byte(r)&mbMask}
    }
}
func decodeUTF8(b []byte) rune {
    switch b0 := b[0]; {
    case b0 < 0x80:
        return rune(b0)
    case b0 < 0xE0:
        return rune(b0&b2Mask)<<6 | rune(b[1]&mbMask)
    case b0 < 0xF0:
        return rune(b0&b3Mask)<<12 | rune(b[1]&mbMask)<<6 | rune(b[2]&mbMask)
    default:
        return rune(b0&b4Mask)<<18 | rune(b[1]&mbMask)<<12 | rune(b[2]&mbMask)<<6 | rune(b[3]&mbMask)
    }
}</lang>
- Output:
A U+0041  41
ö U+00F6  C3B6
Ж U+0416  D096
€ U+20AC  E282AC
𝄞 U+1D11E F09D849E
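The note above points out that validity checks are omitted. As an editorial sketch (Python, with an illustrative function name that is not part of any entry on this page), these are the checks a strict decoder typically adds:
<lang python># Hypothetical helper, for illustration only.
def decode_utf8_checked(b: bytes) -> int:
    n = len(b)
    leads = {1: (0x80, 0x00), 2: (0xE0, 0xC0), 3: (0xF0, 0xE0), 4: (0xF8, 0xF0)}
    if n not in leads:
        raise ValueError("need 1-4 bytes")
    lead_mask, lead = leads[n]
    if (b[0] & lead_mask) != lead:
        raise ValueError("lead byte pattern does not match sequence length")
    if any((byte & 0xC0) != 0x80 for byte in b[1:]):
        raise ValueError("continuation bytes must match 10xxxxxx")
    cp = b[0] & ~lead_mask & 0xFF        # payload bits of the lead byte
    for byte in b[1:]:
        cp = cp << 6 | byte & 0x3F       # 6 payload bits per continuation byte
    if cp < (0, 0, 0x80, 0x800, 0x10000)[n]:
        raise ValueError("overlong encoding")  # e.g. C0 80 for U+0000
    if 0xD800 <= cp <= 0xDFFF or cp > 0x10FFFF:
        raise ValueError("UTF-16 surrogate or beyond U+10FFFF")
    return cp

assert decode_utf8_checked('€'.encode()) == 0x20AC</lang>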
Library/language
<lang go>package main

import (
    "fmt"
    "unicode/utf8"
)

func utf8encode(codepoint rune) []byte {
    buffer := make([]byte, 4)
    length := utf8.EncodeRune(buffer, codepoint)
    return buffer[:length]
}

func utf8decode(bytes []byte) rune {
    result, _ := utf8.DecodeRune(bytes)
    return result
}

func main() {
    fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded")
    for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
        encoded := utf8encode(codepoint)
        decoded := utf8decode(encoded)
        fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
    }
}</lang>
- Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞
Alternately:
<lang go>package main

import (
    "fmt"
)

func utf8encode(codepoint rune) []byte {
    return []byte(string([]rune{codepoint}))
}

func utf8decode(bytes []byte) rune {
    return []rune(string(bytes))[0]
}

func main() {
    fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded")
    for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
        encoded := utf8encode(codepoint)
        decoded := utf8decode(encoded)
        fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
    }
}</lang>
- Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞
Java
<lang java>import java.nio.charset.StandardCharsets;
import java.util.Formatter;

public class UTF8EncodeDecode {

    public static byte[] utf8encode(int codepoint) {
        return new String(new int[]{codepoint}, 0, 1).getBytes(StandardCharsets.UTF_8);
    }

    public static int utf8decode(byte[] bytes) {
        return new String(bytes, StandardCharsets.UTF_8).codePointAt(0);
    }

    public static void main(String[] args) {
        System.out.printf("%-7s %-43s %7s\t%s\t%7s%n",
                "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");

        for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
            byte[] encoded = utf8encode(codepoint);
            Formatter formatter = new Formatter();
            for (byte b : encoded) {
                formatter.format("%02X ", b);
            }
            String encodedHex = formatter.toString();
            int decoded = utf8decode(encoded);
            System.out.printf("%-7c %-43s U+%04X\t%-12s\tU+%04X%n",
                    codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
        }
    }
}</lang>
- Output:
Char    Name                                        Unicode  UTF-8 encoded  Decoded
A       LATIN CAPITAL LETTER A                      U+0041   41             A
ö       LATIN SMALL LETTER O WITH DIAERESIS         U+00F6   C3 B6          ö
Ж       CYRILLIC CAPITAL LETTER ZHE                 U+0416   D0 96          Ж
€       EURO SIGN                                   U+20AC   E2 82 AC       €
𝄞       MUSICAL SYMBOL G CLEF                       U+1D11E  F0 9D 84 9E    𝄞
JavaScript
An implementation in ECMAScript 2015 (ES6):
<lang javascript>
/***************************************************************************\
|* Pure UTF-8 handling without detailed error reporting functionality.    *|
|***************************************************************************|
|* utf8encode                                                              *|
|*   < String character or UInt32 code point                               *|
|*   > Uint8Array encoded_character                                        *|
|*   | ErrorString                                                         *|
|*                                                                         *|
|*   utf8encode takes a string or uint32 representing a single code point  *|
|*   as its argument and returns an array of length 1 up to 4 containing   *|
|*   utf8 code units representing that character.                          *|
|***************************************************************************|
|* utf8decode                                                              *|
|*   < Uint8Array [highendbyte highmidendbyte lowmidendbyte lowendbyte]    *|
|*   > uint32 character                                                    *|
|*   | ErrorString                                                         *|
|*                                                                         *|
|*   utf8decode takes an array of one to four uint8 representing utf8      *|
|*   code units and returns a uint32 representing that code point.         *|
\***************************************************************************/
const
    utf8encode=
        n=>
            (m=>
                m<0x80
                    ?Uint8Array.from(
                        [ m>>0&0x7f|0x00])
                    :m<0x800
                        ?Uint8Array.from(
                            [ m>>6&0x1f|0xc0,m>>0&0x3f|0x80])
                        :m<0x10000
                            ?Uint8Array.from(
                                [ m>>12&0x0f|0xe0,m>>6&0x3f|0x80,m>>0&0x3f|0x80])
                            :m<0x110000
                                ?Uint8Array.from(
                                    [ m>>18&0x07|0xf0,m>>12&0x3f|0x80,m>>6&0x3f|0x80,m>>0&0x3f|0x80])
                                :(()=>{throw'Invalid Unicode Code Point!'})())
            (   typeof n==='string'
                    ?n.codePointAt(0)
                    :n&0x1fffff),
    utf8decode=
        ([m,n,o,p])=>
            m<0x80
                ?( m&0x7f)<<0
                :0xc1<m&&m<0xe0&&n===(n&0xbf)
                    ?( m&0x1f)<<6|( n&0x3f)<<0
                    :(  m===0xe0&&0x9f<n&&n<0xc0
                      ||0xe0<m&&m<0xed&&0x7f<n&&n<0xc0
                      ||m===0xed&&0x7f<n&&n<0xa0
                      ||0xed<m&&m<0xf0&&0x7f<n&&n<0xc0)
                     &&o===(o&0xbf)
                        ?( m&0x0f)<<12|( n&0x3f)<<6|( o&0x3f)<<0
                        :(  m===0xf0&&0x8f<n&&n<0xc0
                          ||m===0xf4&&0x7f<n&&n<0x90
                          ||0xf0<m&&m<0xf4&&0x7f<n&&n<0xc0)
                         &&o===(o&0xbf)&&p===(p&0xbf)
                            ?( m&0x07)<<18|( n&0x3f)<<12|( o&0x3f)<<6|( p&0x3f)<<0
                            :(()=>{throw'Invalid UTF-8 encoding!'})()
</lang>
The testing inputs:
<lang javascript>
const
    str= 'AöЖ€𝄞'
   ,cps= Uint32Array.from(str,s=>s.codePointAt(0))
   ,cus= [ [ 0x41]
         ,[ 0xc3,0xb6]
         ,[ 0xd0,0x96]
         ,[ 0xe2,0x82,0xac]
         ,[ 0xf0,0x9d,0x84,0x9e]]
         .map(a=>Uint8Array.from(a))
   ,zip3=
       ([a,...as],[b,...bs],[c,...cs])=>
           0<as.length+bs.length+cs.length
               ?[ [ a,b,c],...zip3(as,bs,cs)]
               :[ [ a,b,c]]
   ,inputs=zip3(str,cps,cus);
</lang>
The testing code:
<lang javascript>
console.log(`\
${'Character'.padEnd(16)}\
${'CodePoint'.padEnd(16)}\
${'CodeUnits'.padEnd(16)}\
${'utf8encode(ch)'.padEnd(16)}\
${'utf8encode(cp)'.padEnd(16)}\
utf8decode(cu)`)
for(let [ch,cp,cu] of inputs)
    console.log(`\
${ch.padEnd(16)}\
${cp.toString(0x10).padStart(8,'U+000000').padEnd(16)}\
${`[${[...cu].map(n=>n.toString(0x10))}]`.padEnd(16)}\
${`[${[...utf8encode(ch)].map(n=>n.toString(0x10))}]`.padEnd(16)}\
${`[${[...utf8encode(cp)].map(n=>n.toString(0x10))}]`.padEnd(16)}\
${utf8decode(cu).toString(0x10).padStart(8,'U+000000')}`)
</lang>
and finally, the output from the test:
Character       CodePoint       CodeUnits       utf8encode(ch)  utf8encode(cp)  utf8decode(cu)
A               U+000041        [41]            [41]            [41]            U+000041
ö               U+0000f6        [c3,b6]         [c3,b6]         [c3,b6]         U+0000f6
Ж               U+000416        [d0,96]         [d0,96]         [d0,96]         U+000416
€               U+0020ac        [e2,82,ac]      [e2,82,ac]      [e2,82,ac]      U+0020ac
𝄞              U+01d11e        [f0,9d,84,9e]   [f0,9d,84,9e]   [f0,9d,84,9e]   U+01d11e
Note the misalignment on the last line: JavaScript measures string length in UTF-16 code units, so an astral character such as 𝄞 has length 2 and the padding functions come up one column short.
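For contrast, a short editorial Python sketch of the same code-unit arithmetic (Python counts code points, while a JavaScript string reports UTF-16 code units):
<lang python>ch = '\U0001d11e'                        # 𝄞 MUSICAL SYMBOL G CLEF
print(len(ch))                           # 1: Python counts code points
print(len(ch.encode('utf-16-le')) // 2)  # 2: UTF-16 code units, which is what
                                         #    JavaScript's length and padEnd count</lang>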
Kotlin
<lang scala>// version 1.1.2

fun utf8Encode(codePoint: Int) = String(intArrayOf(codePoint), 0, 1).toByteArray(Charsets.UTF_8)

fun utf8Decode(bytes: ByteArray) = String(bytes, Charsets.UTF_8).codePointAt(0)

fun main(args: Array<String>) {
    val codePoints = intArrayOf(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)
    println("Char  Name                                Unicode  UTF-8         Decoded")
    for (codePoint in codePoints) {
        var n = if (codePoint <= 0xFFFF) 4 else 5
        System.out.printf("%-${n}c %-35s U+%05X ", codePoint, Character.getName(codePoint), codePoint)
        val bytes = utf8Encode(codePoint)
        var s = ""
        for (byte in bytes) s += "%02X ".format(byte)
        val decoded = utf8Decode(bytes)
        n = if (decoded.toInt() <= 0xFFFF) 12 else 11
        System.out.printf("%-${n}s %c\n", s, decoded)
    }
}</lang>
- Output:
Char  Name                                Unicode  UTF-8         Decoded
A     LATIN CAPITAL LETTER A              U+00041  41            A
ö     LATIN SMALL LETTER O WITH DIAERESIS U+000F6  C3 B6         ö
Ж     CYRILLIC CAPITAL LETTER ZHE         U+00416  D0 96         Ж
€     EURO SIGN                           U+020AC  E2 82 AC      €
𝄞     MUSICAL SYMBOL G CLEF               U+1D11E  F0 9D 84 9E   𝄞
Mathematica
<lang Mathematica>utf = ToCharacterCode[ToString["AöЖ€", CharacterEncoding -> "UTF8"]]
ToCharacterCode[FromCharacterCode[utf, "UTF8"]]</lang>
- Output:
{65, 195, 182, 208, 150, 226, 130, 172}
{65, 246, 1046, 8364}
Perl 6
Pretty much all built into the language.
<lang perl6>say sprintf("%-18s %-34s %7s %7s\t%s %s\n",
    'Character', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 94;

for < A ö Ж € 𝄞 😜 > -> $char {
    printf " %-7s %-43s %6s U+%04s\t%12s %4s\n",
        $char, $char.uniname, $char.ord, $char.ord.base(16),
        $char.encode('UTF8').list».base(16).Str, $char.encode('UTF8').decode;
}</lang>
- Output:
Character          Name                               Ordinal Unicode	UTF-8 encoded decoded
----------------------------------------------------------------------------------------------
 A       LATIN CAPITAL LETTER A                          65 U+0041	          41    A
 ö       LATIN SMALL LETTER O WITH DIAERESIS            246 U+00F6	       C3 B6    ö
 Ж       CYRILLIC CAPITAL LETTER ZHE                   1046 U+0416	       D0 96    Ж
 €       EURO SIGN                                     8364 U+20AC	    E2 82 AC    €
 𝄞       MUSICAL SYMBOL G CLEF                       119070 U+1D11E	 F0 9D 84 9E    𝄞
 😜       FACE WITH STUCK-OUT TONGUE AND WINKING EYE  128540 U+1F61C	 F0 9F 98 9C    😜
Phix
Standard autoinclude, see the manual and/or builtins/utfconv.e
( http://phix.x10.mx/docs/html/utfconv.htm and/or https://bitbucket.org/petelomax/phix/src )
As requested in the task description:
<lang Phix>constant tests = {#0041, #00F6, #0416, #20AC, #1D11E}

function hex(sequence s, string fmt) -- output helper
    for i=1 to length(s) do s[i] = sprintf(fmt,s[i]) end for
    return join(s,',')
end function

for i=1 to length(tests) do
    integer codepoint = tests[i]
    sequence s = utf32_to_utf8({codepoint}),
             r = utf8_to_utf32(s)
    printf(1,"#%04x -> {%s} -> {%s}\n",{codepoint, hex(s,"#%02x"), hex(r,"#%04x")})
end for</lang>
- Output:
#0041  -> {#41} -> {#0041}
#00F6  -> {#C3,#B6} -> {#00F6}
#0416  -> {#D0,#96} -> {#0416}
#20AC  -> {#E2,#82,#AC} -> {#20AC}
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}
Python
<lang python>#!/usr/bin/env python3
from unicodedata import name

def unicode_code(ch):
    return 'U+{:04x}'.format(ord(ch))

def utf8hex(ch):
    return " ".join([hex(c)[2:] for c in ch.encode('utf8')]).upper()

if __name__ == "__main__":
    print('{:<11} {:<36} {:<15} {:<15}'.format('Character', 'Name', 'Unicode', 'UTF-8 encoding (hex)'))
    chars = ['A', 'ö', 'Ж', '€', '𝄞']
    for char in chars:
        print('{:<11} {:<36} {:<15} {:<15}'.format(char, name(char), unicode_code(char), utf8hex(char)))</lang>
- Output:
Character   Name                                 Unicode         UTF-8 encoding (hex)
A           LATIN CAPITAL LETTER A               U+0041          41
ö           LATIN SMALL LETTER O WITH DIAERESIS  U+00f6          C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE          U+0416          D0 96
€           EURO SIGN                            U+20ac          E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                U+1d11e         F0 9D 84 9E
Racket
<lang racket>#lang racket
(define char-map
  '((LATIN-CAPITAL-LETTER-A . #\U0041)
    (LATIN-SMALL-LETTER-O-WITH-DIAERESIS . #\U00F6)
    (CYRILLIC-CAPITAL-LETTER-ZHE . #\U0416)
    (EURO-SIGN . #\U20AC)
    (MUSICAL-SYMBOL-G-CLEF . #\U1D11E)))

(for ((name.char (in-list char-map)))
  (define name (car name.char))
  (define chr (cdr name.char))
  (let ((bites (bytes->list (string->bytes/utf-8 (list->string (list chr))))))
    (printf "~s\t~a\t~a\t~a\t~a~%"
            chr
            chr
            (map (curryr number->string 16) bites)
            (bytes->string/utf-8 (list->bytes bites))
            name)))</lang>
- Output:
#\A	A	(41)	A	LATIN-CAPITAL-LETTER-A
#\ö	ö	(c3 b6)	ö	LATIN-SMALL-LETTER-O-WITH-DIAERESIS
#\Ж	Ж	(d0 96)	Ж	CYRILLIC-CAPITAL-LETTER-ZHE
#\€	€	(e2 82 ac)	€	EURO-SIGN
#\𝄞	𝄞	(f0 9d 84 9e)	𝄞	MUSICAL-SYMBOL-G-CLEF
Sidef
<lang ruby>func utf8_encoder(Number code) {
    code.chr.encode('UTF-8').bytes.map{.chr}
}

func utf8_decoder(Array bytes) {
    bytes.map{.ord}.decode('UTF-8')
}

for n in ([0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E]) {
    var encoded = utf8_encoder(n)
    var decoded = utf8_decoder(encoded)
    assert_eq(n, decoded.ord)
    say "#{decoded} -> #{encoded}"
}</lang>
- Output:
A -> ["A"] ö -> ["\xC3", "\xB6"] Ж -> ["\xD0", "\x96"] € -> ["\xE2", "\x82", "\xAC"] 𝄞 -> ["\xF0", "\x9D", "\x84", "\x9E"]
Tcl
Note: Tcl can handle Unicode code points only up to U+FFFD, i.e. the Basic Multilingual Plane (BMP, 16 bits wide). Therefore, the fifth test fails as expected.
<lang Tcl>proc encoder int {
    set u [format %c $int]
    set bytes {}
    foreach byte [split [encoding convertto utf-8 $u] ""] {
        lappend bytes [format %02X [scan $byte %c]]
    }
    return $bytes
}
proc decoder bytes {
    set str {}
    foreach byte $bytes {
        append str [format %c [scan $byte %x]]
    }
    return [encoding convertfrom utf-8 $str]
}
foreach test {0x0041 0x00f6 0x0416 0x20ac 0x1d11e} {
    set res $test
    lappend res [encoder $test] -> [decoder [encoder $test]]
    puts $res
}</lang>
0x0041 41 -> A
0x00f6 {C3 B6} -> ö
0x0416 {D0 96} -> Ж
0x20ac {E2 82 AC} -> €
0x1d11e {EF BF BD} -> �
Alternative Implementation
While perhaps not as readable as the above, this version handles beyond-BMP codepoints by manually composing the utf-8 byte sequences and emitting raw bytes to the console. The encoding convertto utf-8 command still does the heavy lifting where it can.
<lang Tcl>proc utf8 {codepoint} {
    scan $codepoint %llx cp
    if {$cp < 0x10000} {
        set str [subst \\u$codepoint]             ;# substitute per Tcl backslash rule
        set bytes [encoding convertto utf-8 $str] ;# encode
    } else {                                      ;# codepoints beyond the BMP need manual approach
        set bits [format %021b $cp]               ;# format as binary string
        set unibits 11110[string range $bits 0 2] ;# insert extra bits for utf-8 4-byte encoding
        append unibits 10[string range $bits 3 8]
        append unibits 10[string range $bits 9 14]
        append unibits 10[string range $bits 15 20]
        set bytes [binary format B* $unibits]     ;# turn into a sequence of bytes
    }
    return $bytes
}

proc hexchars {s} {
    binary scan $s H* hex
    regsub -all .. $hex {\0 }
}

# for the test, we assume the tty is in utf-8 mode and can handle beyond-BMP chars
# so set output mode to binary so we can write raw bytes!
chan configure stdout -encoding binary
foreach codepoint { 41 F6 416 20AC 1D11E } {
    set utf8 [utf8 $codepoint]
    puts "[format U+%04s $codepoint]\t$utf8\t[hexchars $utf8]"
}</lang>
- Output:
U+0041	A	41
U+00F6	ö	c3 b6
U+0416	Ж	d0 96
U+20AC	€	e2 82 ac
U+1D11E	𝄞	f0 9d 84 9e
zkl
<lang zkl>println("Char Unicode UTF-8");
foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),
        T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
   utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
   char   :=unicode_int.toString(-8); // Unicode int to UTF-8 string
   // UTF-8 bytes to UTF-8 string:
   char2  :=Data(Void,utf_int.toBigEndian(utf_int.len())).text;
   println("%s %s %9s %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));
}</lang>
Int.len() --> number of bytes in an int. This could be hard-coded, because a UTF-8 sequence has a maximum of six bytes (four under the current RFC 3629 definition), and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0, which is a zero-terminated string ("A").
- Output:
Char Unicode UTF-8
A A      U+41 41
ö ö      U+f6 c3b6
Ж Ж     U+416 d096
€ €    U+20ac e282ac
𝄞 𝄞   U+1d11e f09d849e
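As an editorial aside, the packed-integer trick above (treating a whole UTF-8 sequence as one big-endian integer) can be mirrored in Python; the variable names are illustrative only:
<lang python>utf_int = 0xE282AC                                 # UTF-8 bytes of '€' packed into one integer
n = (utf_int.bit_length() + 7) // 8                # byte count, playing the role of Int.len()
print(utf_int.to_bytes(n, 'big').decode('utf-8'))  # prints €</lang>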