UTF-8 encode and decode
As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.
You are encouraged to solve this task according to the task description, using any language you may know.
The goal of this task is to write an encoder that takes a Unicode code-point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.
Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode character.
Demonstrate the functionality of your encoder and decoder on the following five characters:
Character   Name                                  Unicode    UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041     41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6     C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416     D0 96
€           EURO SIGN                             U+20AC     E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E    F0 9D 84 9E
Provided below is a reference implementation in Common Lisp.
Common Lisp
Helper functions
<lang lisp> (defun ascii-byte-p (octet)
  "Return t if OCTET is a single-byte 7-bit ASCII char, nil otherwise.
  The most significant bit is 0, so the allowed pattern is 0xxx xxxx.
  OCTET must be an integer in the range 0-255; anything else fails an assertion."
  ;; (unsigned-byte 8) also rejects negative integers; a plain integer-length
  ;; check would accept them because integer-length ignores the sign.
  (assert (typep octet '(unsigned-byte 8)))
  (let ((bitmask  #b10000000)
        (template #b00000000))
    ;; Bitwise-and with the bitmask #b10000000 to extract the most significant
    ;; bit, then check that it equals the template #b00000000 (bit is clear).
    (= (logand bitmask octet) template)))
(defun multi-byte-p (octet)
  "Return t if OCTET is part of a multi-byte UTF-8 sequence, nil otherwise.
  The multi-byte pattern is 1xxx xxxx. A multi-byte can be either a lead byte
  or a trail byte.
  OCTET must be an integer in the range 0-255; anything else fails an assertion."
  ;; (unsigned-byte 8) also rejects negative integers; a plain integer-length
  ;; check would accept them because integer-length ignores the sign.
  (assert (typep octet '(unsigned-byte 8)))
  (let ((bitmask  #b10000000)
        (template #b10000000))
    ;; Bitwise-and with the bitmask #b10000000 to extract the most significant
    ;; bit, then check that it equals the template #b10000000 (bit is set).
    (= (logand bitmask octet) template)))
(defun lead-byte-p (octet)
  "Return t if OCTET is one of the leading bytes of a UTF-8 sequence, nil
  otherwise. Allowed leading byte patterns are 0xxx xxxx, 110x xxxx, 1110 xxxx
  and 1111 0xxx.
  OCTET must be an integer in the range 0-255; anything else fails an assertion."
  ;; (unsigned-byte 8) also rejects negative integers; a plain integer-length
  ;; check would accept them because integer-length ignores the sign.
  (assert (typep octet '(unsigned-byte 8)))
  ;; Each bitmask selects the prefix bits of one lead-byte pattern; the template
  ;; at the same position is the value those bits must have.
  (let ((bitmasks  (list #b10000000 #b11100000 #b11110000 #b11111000))
        (templates (list #b00000000 #b11000000 #b11100000 #b11110000)))
    (some #'(lambda (mask template) (= (logand mask octet) template))
          bitmasks
          templates)))
(defun n-trail-bytes (octet)
  "Take a leading UTF-8 byte, return the number of continuation bytes that
  follow it: 0 for 0xxx xxxx, 1 for 110x xxxx, 2 for 1110 xxxx and 3 for
  1111 0xxx. Return nil if OCTET matches none of these lead-byte patterns.
  OCTET must be an integer in the range 0-255; anything else fails an assertion."
  ;; (unsigned-byte 8) also rejects negative integers; a plain integer-length
  ;; check would accept them because integer-length ignores the sign.
  (assert (typep octet '(unsigned-byte 8)))
  (let ((bitmasks  (list #b10000000 #b11100000 #b11110000 #b11111000))
        (templates (list #b00000000 #b11000000 #b11100000 #b11110000)))
    ;; Walk both lists in parallel; the position of the first matching pattern
    ;; is the number of trail bytes.
    (loop for mask in bitmasks
          for template in templates
          for count from 0
          when (= template (logand mask octet))
            return count)))
</lang>
Encoder
<lang lisp> (defun unicode-to-utf-8 (int)
  "Take a Unicode code point INT (an integer from 0 to #x10FFFF), return a list
  of one to four UTF-8 encoded bytes (octets), lead byte first.
  Fails an assertion if INT is not an integer in that range."
  ;; A bare 21-bit length check is not enough: #x110000-#x1FFFFF and negative
  ;; integers fit in 21 bits but match no branch of the COND below, which would
  ;; then yield NIL and fail later with an unrelated type error.
  ;; NOTE: surrogate code points (#xD800-#xDFFF) fall in the three-byte range
  ;; and are encoded like any other value; they are not rejected here.
  (assert (typep int '(integer 0 #x10FFFF)))
  (let ((n-trail-bytes (cond ((<= #x00000 int #x00007F) 0)
                             ((<= #x00080 int #x0007FF) 1)
                             ((<= #x00800 int #x00FFFF) 2)
                             ((<= #x10000 int #x10FFFF) 3)))
        ;; fixed prefix bits of the lead byte, indexed by number of trail bytes.
        (lead-templates (list #b00000000 #b11000000 #b11100000 #b11110000))
        ;; fixed prefix bits of every trail byte.
        (trail-template #b10000000)
        ;; number of content bits in the lead byte, indexed like lead-templates.
        (n-lead-bits (list 7 5 4 3))
        ;; number of content bits in a trail byte.
        (n-trail-bits 6)
        ;; list to put the UTF-8 encoded bytes in.
        (byte-list nil))
    (if (= n-trail-bytes 0)
        ;; if we need 0 trail bytes, it is just a single ASCII byte.
        (push int byte-list)
        (progn
          ;; otherwise fill the trail bytes with 6 bits each, starting from the
          ;; least significant bits; PUSH leaves the last trail byte at the end.
          (loop for i from 0 to (1- n-trail-bytes)
                do (push (+ trail-template
                            (ldb (byte n-trail-bits (* i n-trail-bits)) int))
                         byte-list))
          ;; then copy the remaining high-order content bits into the lead byte.
          (push (+ (nth n-trail-bytes lead-templates)
                   (ldb (byte (nth n-trail-bytes n-lead-bits)
                              (* n-trail-bytes n-trail-bits))
                        int))
                byte-list)))
    ;; return the list of UTF-8 encoded bytes.
    byte-list))
</lang>
Decoder
<lang lisp> (defun utf-8-to-unicode (byte-list)
  "Take a list of one to four UTF-8 encoded bytes (octets), lead byte first,
  and return the code point they encode.
  Signals an error if the first byte is not a lead byte, if the number of bytes
  does not match the length announced by the lead byte, or if any byte after
  the lead byte is not a continuation byte (10xx xxxx)."
  (let ((b1 (car byte-list)))
    (cond
      ;; a single ASCII byte is its own code point.
      ((ascii-byte-p b1) b1)
      ((multi-byte-p b1)
       (unless (lead-byte-p b1)
         (error "first byte in the list isnt a lead byte"))
       (let ((n (n-trail-bytes b1))
             (trail-bytes (cdr byte-list))
             ;; Content bits we want to extract from each lead byte.
             (lead-templates (list #b01111111 #b00011111 #b00001111 #b00000111))
             ;; Content bits we want to extract from each trail byte.
             (trail-template #b00111111))
         (unless (= n (1- (list-length byte-list)))
           (error "calculated number of bytes doesnt match the length of the byte list"))
         ;; Every byte after the lead byte must look like 10xx xxxx; without
         ;; this check a stray ASCII or lead byte would be decoded silently.
         (unless (every #'(lambda (octet)
                            (and (typep octet '(unsigned-byte 8))
                                 (= (logand octet #b11000000) #b10000000)))
                        trail-bytes)
           (error "trail byte in the list isnt a continuation byte"))
         ;; lead-byte content goes in the highest bits, then each trail byte
         ;; contributes 6 bits, most significant group first.
         (+ (ash (logand b1 (nth n lead-templates)) (* 6 n))
            (loop for octet in trail-bytes
                  for i from 1
                  sum (ash (logand octet trail-template) (* 6 (- n i))))))))))
</lang>
The test
<lang lisp> (defun test-utf-8 ()
  "Encode and decode the five sample code points, print one report line for
  each, and return t if every code point survives the round trip unchanged."
  (let* ((code-points (list 65 246 1046 8364 119070))
         ;; round-trip every code point up front, before anything is printed.
         (round-tripped (loop for cp in code-points
                              collect (utf-8-to-unicode (unicode-to-utf-8 cp)))))
    ;; report the character, its code point and its UTF-8 bytes in hex.
    (dolist (cp code-points)
      (format t "character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
              (code-char cp)
              cp
              (unicode-to-utf-8 cp)))
    ;; t only if each decoded value equals the code point it started from.
    (every #'= code-points round-tripped)))
</lang>
Test output
<lang lisp> CL-USER> (test-utf-8) character A, code point: 41, utf-8: 41 character ö, code point: F6, utf-8: C3 B6 character Ж, code point: 416, utf-8: D0 96 character €, code point: 20AC, utf-8: E2 82 AC character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E T </lang>
Go
<lang go>package main
import (
"fmt" "unicode/utf8"
)
// utf8encode returns the UTF-8 encoding of codepoint as produced by
// utf8.EncodeRune, trimmed to the number of bytes actually written.
func utf8encode(codepoint rune) []byte {
	// utf8.UTFMax (4) is the largest number of bytes one rune can occupy.
	var scratch [utf8.UTFMax]byte
	written := utf8.EncodeRune(scratch[:], codepoint)
	return scratch[:written]
}
// utf8decode returns the first rune that utf8.DecodeRune reads from bytes.
// The byte count reported by DecodeRune is discarded.
func utf8decode(bytes []byte) rune {
	decoded, _ := utf8.DecodeRune(bytes)
	return decoded
}
func main() {
fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded"); for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} { encoded := utf8encode(codepoint) decoded := utf8decode(encoded) fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded) }
}</lang>
- Output:
Char Unicode UTF-8 encoded Decoded A U+0041 41 A ö U+00F6 C3B6 ö Ж U+0416 D096 Ж € U+20AC E282AC € 𝄞 U+1D11E F09D849E 𝄞
Java
<lang java>import java.util.Formatter; import java.io.UnsupportedEncodingException;
/**
 * Demonstrates UTF-8 encoding and decoding of single Unicode code points
 * using the JDK's built-in charset support.
 */
public class UTF8EncodeDecode {

    /**
     * Encodes one Unicode code point as UTF-8.
     *
     * @param codepoint the code point to encode
     * @return the UTF-8 bytes of the one-code-point string
     * @throws UnsupportedEncodingException declared by {@code getBytes(String)}
     */
    public static byte[] utf8encode(int codepoint) throws UnsupportedEncodingException {
        return new String(new int[]{codepoint}, 0, 1).getBytes("UTF-8");
    }

    /**
     * Decodes UTF-8 bytes and returns the first code point of the result.
     *
     * @param bytes the UTF-8 encoded bytes
     * @return the code point at index 0 of the decoded string
     * @throws UnsupportedEncodingException declared by {@code String(byte[], String)}
     */
    public static int utf8decode(byte[] bytes) throws UnsupportedEncodingException {
        return new String(bytes, "UTF-8").codePointAt(0);
    }

    /**
     * Prints a table with one row per sample code point: the character, its
     * Unicode name, its code point, its UTF-8 bytes in hex, and the character
     * obtained by decoding those bytes again.
     *
     * Only one {@code main(String[])} may exist in a class; this is the variant
     * that prints the decoded value as a character.
     */
    public static final void main(String[] args) throws UnsupportedEncodingException {
        System.out.printf("%-7s %-43s %7s\t%s\t%s\n",
                "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");

        for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
            byte[] encoded = utf8encode(codepoint);

            // Render each encoded byte as two upper-case hex digits plus a space.
            Formatter formatter = new Formatter();
            for (byte b : encoded) {
                formatter.format("%02X ", b);
            }
            String encodedHex = formatter.toString();

            int decoded = utf8decode(encoded);
            System.out.printf("%-7c %-43s U+%04X\t%-12s\t%c\n",
                    codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
        }
    }
}</lang>
- Output:
Char Name Unicode UTF-8 encoded Decoded A LATIN CAPITAL LETTER A U+0041 41 A ö LATIN SMALL LETTER O WITH DIAERESIS U+00F6 C3 B6 ö Ж CYRILLIC CAPITAL LETTER ZHE U+0416 D0 96 Ж € EURO SIGN U+20AC E2 82 AC € 𝄞 MUSICAL SYMBOL G CLEF U+1D11E F0 9D 84 9E 𝄞
Perl 6
Pretty much all built in to the language. <lang perl6>say sprintf("%-18s %-34s %7s %7s\t%s %s\n", 'Character', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 94;
# The statement above prints the column headings followed by a rule made of 94 dashes.
# The loop below prints one row per sample character.
for < A ö Ж € 𝄞 😜 > -> $char {
# Columns, in order: the character, its .uniname, its .ord, the ordinal in
# base 16, the bytes of its 'UTF8' encoding each rendered in base 16, and
# the result of decoding those encoded bytes again.
printf " %-7s %-43s %6s U+%04s\t%12s %4s\n", $char, $char.uniname, $char.ord, $char.ord.base(16), $char.encode('UTF8').list».base(16).Str, $char.encode('UTF8').decode;
}</lang>
- Output:
Character Name Ordinal Unicode UTF-8 encoded decoded ---------------------------------------------------------------------------------------------- A LATIN CAPITAL LETTER A 65 U+0041 41 A ö LATIN SMALL LETTER O WITH DIAERESIS 246 U+00F6 C3 B6 ö Ж CYRILLIC CAPITAL LETTER ZHE 1046 U+0416 D0 96 Ж € EURO SIGN 8364 U+20AC E2 82 AC € 𝄞 MUSICAL SYMBOL G CLEF 119070 U+1D11E F0 9D 84 9E 𝄞 😜 FACE WITH STUCK-OUT TONGUE AND WINKING EYE 128540 U+1F61C F0 9F 98 9C 😜
Racket
<lang racket>#lang racket
;; Association list pairing a descriptive symbol with each sample character.
(define char-map
  (list (cons 'LATIN-CAPITAL-LETTER-A #\U0041)
        (cons 'LATIN-SMALL-LETTER-O-WITH-DIAERESIS #\U00F6)
        (cons 'CYRILLIC-CAPITAL-LETTER-ZHE #\U0416)
        (cons 'EURO-SIGN #\U20AC)
        (cons 'MUSICAL-SYMBOL-G-CLEF #\U1D11E)))
;; For every (name . char) entry print, tab-separated: the char as written by
;; ~s, the char itself, its UTF-8 bytes in hex, the bytes decoded back to a
;; string, and the entry's name.
(for ([entry (in-list char-map)])
  (match-define (cons name chr) entry)
  (define octets (bytes->list (string->bytes/utf-8 (string chr))))
  (define hex-octets
    (for/list ([octet (in-list octets)])
      (number->string octet 16)))
  (printf "~s\t~a\t~a\t~a\t~a~%"
          chr
          chr
          hex-octets
          (bytes->string/utf-8 (list->bytes octets))
          name))</lang>
- Output:
#\A A (41) A LATIN-CAPITAL-LETTER-A #\ö ö (c3 b6) ö LATIN-SMALL-LETTER-O-WITH-DIAERESIS #\Ж Ж (d0 96) Ж CYRILLIC-CAPITAL-LETTER-ZHE #\€ € (e2 82 ac) € EURO-SIGN #\𝄞 𝄞 (f0 9d 84 9e) 𝄞 MUSICAL-SYMBOL-G-CLEF
Sidef
<lang ruby>func utf8_encoder(Number code) {
# Turn the code point into a character, encode it as 'UTF-8', take the
# resulting bytes and map each one through .chr.
code.chr.encode('UTF-8').bytes.map{.chr}
}
func utf8_decoder(Array bytes) {
# Map every element through .ord, then decode the result as 'UTF-8'.
bytes.map{.ord}.decode('UTF-8')
}
# Round-trip each sample code point through the encoder and decoder, assert
# that the decoded value's .ord equals the original, and print both forms.
for n in ([0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E]) {
var encoded = utf8_encoder(n)
var decoded = utf8_decoder(encoded)
assert_eq(n, decoded.ord)
say "#{decoded} -> #{encoded}"
}</lang>
- Output:
A -> ["A"] ö -> ["\xC3", "\xB6"] Ж -> ["\xD0", "\x96"] € -> ["\xE2", "\x82", "\xAC"] 𝄞 -> ["\xF0", "\x9D", "\x84", "\x9E"]
zkl
<lang zkl>println("Char Unicode UTF-8");
// Each entry pairs a string written with a \U...; escape with the same
// code point as an integer.
foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),
T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
// Fold the string into one integer: multiply the running value by 0x100
// and add c.toAsc() for each element c, starting from 0.
utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
char :=unicode_int.toString(-8); // Unicode int to UTF-8 string
// UTF-8 bytes to UTF-8 string:
char2:=Data(Void,utf_int.toBigEndian(utf_int.len())).text;
// Print both strings, the code point as U+hex, and the folded integer in hex.
println("%s %s %9s %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));
}</lang> Int.len() --> number of bytes in int. This could be hard coded because UTF-8 has a max of 6 bytes and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0 which is a zero terminated string ("A");
- Output:
Char Unicode UTF-8 A A U+41 41 ö ö U+f6 c3b6 Ж Ж U+416 d096 € € U+20ac e282ac 𝄞 𝄞 U+1d11e f09d849e