CloudFlare suffered a massive security issue affecting all of its customers, including Rosetta Code. All passwords not changed since February 19th 2017 have been expired, and session cookie longevity will be reduced until late March.--Michael Mol (talk) 05:15, 25 February 2017 (UTC)

UTF-8 encode and decode

From Rosetta Code
Task
UTF-8 encode and decode
You are encouraged to solve this task according to the task description, using any language you may know.

As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.

The goal of this task is to write an encoder that takes a Unicode code point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.

Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode character.

Demonstrate the functionality of your encoder and decoder on the following five characters:

Character   Name                                  Unicode    UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041     41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6     C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416     D0 96
€           EURO SIGN                             U+20AC     E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E    F0 9D 84 9E

Provided below is a reference implementation in Common Lisp.

Common Lisp[edit]

Helper functions

 
(defun ascii-byte-p (octet)
  "Predicate: true when OCTET is a single-byte 7-bit ASCII value.
Such a byte has its most significant bit clear, i.e. it matches 0xxx xxxx.
Signals an error when OCTET is not an integer or does not fit in 8 bits."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Only bit 7 decides: it is 0 for ASCII and 1 for every multi-byte octet.
  (not (logbitp 7 octet)))
 
(defun multi-byte-p (octet)
  "Predicate: true when OCTET belongs to a multi-byte UTF-8 sequence.
Every such byte, lead or trail alike, matches the pattern 1xxx xxxx.
Signals an error when OCTET is not an integer or does not fit in 8 bits."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Extract bit 7 on its own; a set bit means the octet is not plain ASCII.
  (not (zerop (ldb (byte 1 7) octet))))
 
(defun lead-byte-p (octet)
  "Predicate: true when OCTET can start a UTF-8 sequence, NIL otherwise.
The accepted lead patterns are 0xxx xxxx, 110x xxxx, 1110 xxxx and 1111 0xxx.
Signals an error when OCTET is not an integer or does not fit in 8 bits."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Each entry pairs the mask selecting the prefix bits with the prefix
  ;; those bits must equal.
  (loop for (mask . prefix) in '((#b10000000 . #b00000000)
                                 (#b11100000 . #b11000000)
                                 (#b11110000 . #b11100000)
                                 (#b11111000 . #b11110000))
        thereis (= (logand mask octet) prefix)))
 
(defun n-trail-bytes (octet)
  "Return how many continuation bytes follow the UTF-8 lead byte OCTET.
The result is 0 for 0xxx xxxx, 1 for 110x xxxx, 2 for 1110 xxxx and 3 for
1111 0xxx; NIL is returned when OCTET matches none of these lead patterns.
Signals an error when OCTET is not an integer or does not fit in 8 bits."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Test the lead patterns from shortest to longest prefix.
  (cond ((= (logand octet #b10000000) #b00000000) 0)
        ((= (logand octet #b11100000) #b11000000) 1)
        ((= (logand octet #b11110000) #b11100000) 2)
        ((= (logand octet #b11111000) #b11110000) 3)
        (t nil)))
 

Encoder

 
(defun unicode-to-utf-8 (int)
  "Encode the Unicode code point INT as UTF-8.
Return a list of one to four octets, lead byte first.
Signals an error unless INT is an integer in the range 0 .. #x10FFFF.
Surrogate code points (#xD800 - #xDFFF) are not rejected."
  (assert (typep int 'integer))
  ;; Check the real Unicode range rather than a bit length: a 21-bit test
  ;; would also admit #x110000 .. #x1FFFFF and negative integers, none of
  ;; which has a UTF-8 encoding.
  (assert (<= 0 int #x10FFFF))
  (let* ((n-trail-bytes (cond ((<= int #x00007F) 0)
                              ((<= int #x0007FF) 1)
                              ((<= int #x00FFFF) 2)
                              (t 3)))
         ;; marker bits of the lead byte, indexed by the number of trail bytes.
         (lead-template (nth n-trail-bytes
                             (list #b00000000 #b11000000 #b11100000 #b11110000)))
         ;; number of content bits in the lead byte, indexed the same way.
         (n-lead-bits (nth n-trail-bytes (list 7 5 4 3)))
         ;; every trail byte is 10xx xxxx and carries 6 content bits.
         (trail-template #b10000000)
         (n-trail-bits 6)
         ;; list to put the UTF-8 encoded bytes in.
         (byte-list nil))
    ;; Fill the trail bytes starting from the least significant 6 bits; since
    ;; PUSH prepends, the most significant trail byte ends up first.
    (dotimes (i n-trail-bytes)
      (push (logior trail-template
                    (ldb (byte n-trail-bits (* i n-trail-bits)) int))
            byte-list))
    ;; The remaining high bits go into the lead byte.  With zero trail bytes
    ;; this is just the 7-bit ASCII value itself.
    (push (logior lead-template
                  (ldb (byte n-lead-bits (* n-trail-bytes n-trail-bits)) int))
          byte-list)
    byte-list))
 

Decoder

 
(defun utf-8-to-unicode (byte-list)
  "Decode BYTE-LIST, a list of one to four UTF-8 octets, into a code point.
Return the code point as an integer.
Signals an error when BYTE-LIST is empty, when its first element is not a
lead byte, when the number of bytes does not match what the lead byte
announces, or when a trail byte does not match the pattern 10xx xxxx.
If the first byte is ASCII it is returned as is and any further elements of
BYTE-LIST are ignored."
  (when (null byte-list)
    (error "the byte list is empty"))
  (let ((b1 (car byte-list)))
    (cond ((ascii-byte-p b1) b1)        ; if a single byte, just return it.
          ((not (lead-byte-p b1))
           (error "first byte in the list isnt a lead byte"))
          (t
           (let ((n (n-trail-bytes b1))
                 (trail-bytes (cdr byte-list))
                 ;; Content bits we want to extract from each lead byte.
                 (lead-templates (list #b01111111 #b00011111 #b00001111 #b00000111))
                 ;; Content bits we want to extract from each trail byte.
                 (trail-template #b00111111))
             (unless (= n (1- (list-length byte-list)))
               (error "calculated number of bytes doesnt match the length of the byte list"))
             ;; A continuation byte is exactly an integer in #x80 .. #xBF;
             ;; anything else would silently decode to a wrong code point.
             (unless (every #'(lambda (b) (typep b '(integer #x80 #xBF)))
                            trail-bytes)
               (error "a trail byte doesnt match the pattern 10xx xxxx"))
             ;; Start from the lead byte's content bits and shift in 6 more
             ;; bits for every trail byte, most significant first.
             (reduce #'(lambda (acc b) (+ (ash acc 6) (logand b trail-template)))
                     trail-bytes
                     :initial-value (logand b1 (nth n lead-templates))))))))
 

The test

 
(defun test-utf-8 ()
  "Round-trip the five sample code points through the encoder and decoder.
Print one line per code point (character, code point, UTF-8 bytes) to
standard output and return T when every code point decodes back to itself."
  (let* ((code-points (list 65 246 1046 8364 119070))
         ;; Encode and immediately decode each code point.
         (round-tripped (loop for cp in code-points
                              collect (utf-8-to-unicode (unicode-to-utf-8 cp)))))
    ;; Report each character together with its encoding.
    (dolist (cp code-points)
      (format t
              "character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
              (code-char cp)
              cp
              (unicode-to-utf-8 cp)))
    ;; T only if every decoded value equals the original.
    (every #'= code-points round-tripped)))
 

Test output

 
CL-USER> (test-utf-8)
character A, code point: 41, utf-8: 41
character ö, code point: F6, utf-8: C3 B6
character Ж, code point: 416, utf-8: D0 96
character €, code point: 20AC, utf-8: E2 82 AC
character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E
T
 

Go[edit]

package main
 
import (
"fmt"
"unicode/utf8"
)
 
// utf8encode returns the UTF-8 encoding of codepoint as a slice of one to
// four bytes, as produced by utf8.EncodeRune.
func utf8encode(codepoint rune) []byte {
	// utf8.UTFMax (4) is the longest encoding EncodeRune can emit.
	var scratch [utf8.UTFMax]byte
	n := utf8.EncodeRune(scratch[:], codepoint)
	return scratch[:n]
}
 
// utf8decode decodes the first UTF-8 encoded rune found in bytes and returns
// it. The size reported by utf8.DecodeRune is discarded.
func utf8decode(bytes []byte) (decoded rune) {
	decoded, _ = utf8.DecodeRune(bytes)
	return decoded
}
 
func main() {
fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
for _, codepoint := range []rune{'A', 'ΓΆ', 'Π–', '€', 'π„ž'} {
encoded := utf8encode(codepoint)
decoded := utf8decode(encoded)
fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
}
}
Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Alternately:

package main
 
import (
"fmt"
)
 
// utf8encode returns the UTF-8 bytes of codepoint, relying on Go's built-in
// rune-to-string conversion.
func utf8encode(codepoint rune) []byte {
	asString := string(codepoint)
	return []byte(asString)
}
 
// utf8decode returns the first rune encoded in bytes, relying on Go's
// built-in UTF-8 decoding of strings.
//
// Empty input yields U+FFFD (the Unicode replacement character) instead of
// panicking on an out-of-range index, and only the first rune is decoded
// rather than the whole slice.
func utf8decode(bytes []byte) rune {
	// Ranging over a string decodes one rune per iteration; return the first.
	for _, r := range string(bytes) {
		return r
	}
	return '\uFFFD'
}
 
func main() {
fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
for _, codepoint := range []rune{'A', 'ΓΆ', 'Π–', '€', 'π„ž'} {
encoded := utf8encode(codepoint)
decoded := utf8decode(encoded)
fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
}
}
Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Java[edit]

Works with: Java version 7+
import java.util.Formatter;
import java.io.UnsupportedEncodingException;
 
/**
 * Demonstrates UTF-8 encoding and decoding of single Unicode code points
 * using the conversions built into {@link String}.
 */
public class UTF8EncodeDecode {
    /**
     * Encodes one code point as UTF-8.
     *
     * @param codepoint the Unicode code point to encode
     * @return the UTF-8 bytes of the one-character string built from {@code codepoint}
     * @throws UnsupportedEncodingException declared by {@code String.getBytes(String)}
     * @throws IllegalArgumentException if {@code codepoint} is not a valid code point
     */
    public static byte[] utf8encode(int codepoint) throws UnsupportedEncodingException {
        return new String(new int[]{codepoint}, 0, 1).getBytes("UTF-8");
    }

    /**
     * Decodes UTF-8 bytes and returns the first code point of the result.
     *
     * @param bytes UTF-8 encoded bytes
     * @return the code point at index 0 of the decoded string
     * @throws UnsupportedEncodingException declared by the {@code String(byte[], String)} constructor
     * @throws IndexOutOfBoundsException if the decoded string is empty
     */
    public static int utf8decode(byte[] bytes) throws UnsupportedEncodingException {
        return new String(bytes, "UTF-8").codePointAt(0);
    }

    /**
     * Prints a table with one row per sample code point: the character, its
     * Unicode name, the code point, its UTF-8 bytes in hex and the decoded
     * character.
     *
     * Only one {@code main} may exist per class; this is the variant whose
     * last column shows the decoded character.
     */
    public static final void main(String[] args) throws UnsupportedEncodingException {
        System.out.printf("%-7s %-43s %7s\t%s\t%s\n", "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");
        for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
            byte[] encoded = utf8encode(codepoint);
            String encodedHex;
            // Formatter is Closeable; try-with-resources closes it once the hex string is built.
            try (Formatter formatter = new Formatter()) {
                for (byte b : encoded) {
                    formatter.format("%02X ", b);
                }
                encodedHex = formatter.toString();
            }
            int decoded = utf8decode(encoded);
            System.out.printf("%-7c %-43s U+%04X\t%-12s\t%c\n", codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
        }
    }
}
Output:
Char    Name                                        Unicode	UTF-8 encoded	Decoded
A       LATIN CAPITAL LETTER A                      U+0041	41          	A
ö       LATIN SMALL LETTER O WITH DIAERESIS         U+00F6	C3 B6       	ö
Ж       CYRILLIC CAPITAL LETTER ZHE                 U+0416	D0 96       	Ж
€       EURO SIGN                                   U+20AC	E2 82 AC    	€
𝄞      MUSICAL SYMBOL G CLEF                       U+1D11E	F0 9D 84 9E 	𝄞

Perl 6[edit]

Works with: Rakudo version 2017.02

Pretty much all built in to the language.

# Print the column headings followed by a rule of 94 dashes.
say sprintf("%-18s %-34s %7s %7s\t%s  %s\n", 'Character', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 94;

# One row per sample character: the character, its Unicode name, its ordinal,
# the ordinal in hex, the UTF-8 bytes in hex and the round-tripped string.
for < A ö Ж € 𝄞 😜 > -> $char {
    printf "  %-7s %-43s %6s U+%04s\t%12s %4s\n", $char, $char.uniname, $char.ord,
      # ».base(16) applies .base(16) to every byte of the encoded buffer.
      $char.ord.base(16), $char.encode('UTF8').list».base(16).Str, $char.encode('UTF8').decode;
}
Output:
Character          Name                               Ordinal Unicode	UTF-8 encoded  decoded
----------------------------------------------------------------------------------------------
   A       LATIN CAPITAL LETTER A                          65 U+0041	          41    A
   ö       LATIN SMALL LETTER O WITH DIAERESIS            246 U+00F6	       C3 B6    ö
   Ж       CYRILLIC CAPITAL LETTER ZHE                   1046 U+0416	       D0 96    Ж
   €       EURO SIGN                                     8364 U+20AC	    E2 82 AC    €
   𝄞       MUSICAL SYMBOL G CLEF                       119070 U+1D11E	 F0 9D 84 9E    𝄞
   😜      FACE WITH STUCK-OUT TONGUE AND WINKING EYE  128540 U+1F61C	 F0 9F 98 9C    😜

Phix[edit]

Standard autoinclude, see the manual and/or builtins/utfconv.e ( http://phix.x10.mx/docs/html/utfconv.htm and/or https://bitbucket.org/petelomax/phix/src )
As requested in the task description:

-- The five code points the task asks to demonstrate (U+0041, U+00F6, U+0416,
-- U+20AC and U+1D11E), written as hexadecimal literals.
constant tests = {#0041, #00F6, #0416, #20AC, #1D11E}
 
function hex(sequence s, string fmt) -- output helper
    -- Format every element of s with fmt and return the pieces joined by commas.
    sequence pieces = repeat(0, length(s))
    for idx=1 to length(s) do
        pieces[idx] = sprintf(fmt, s[idx])
    end for
    return join(pieces, ',')
end function
 
-- For every test code point: encode it to UTF-8, decode the result back to
-- UTF-32 and print the code point, the encoded bytes and the decoded value.
for i=1 to length(tests) do
    integer codepoint = tests[i]
    sequence utf8 = utf32_to_utf8({codepoint})
    sequence utf32 = utf8_to_utf32(utf8)
    sequence encoded_text = hex(utf8,"#%02x")
    sequence decoded_text = hex(utf32,"#%04x")
    printf(1,"#%04x -> {%s} -> {%s}\n",{codepoint, encoded_text, decoded_text})
end for
Output:
#0041 -> {#41} -> {#0041}
#00F6 -> {#C3,#B6} -> {#00F6}
#0416 -> {#D0,#96} -> {#0416}
#20AC -> {#E2,#82,#AC} -> {#20AC}
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}

Racket[edit]

#lang racket
 
;; Association list pairing a symbolic Unicode name with each sample character.
(define char-map
  (list (cons 'LATIN-CAPITAL-LETTER-A #\U0041)
        (cons 'LATIN-SMALL-LETTER-O-WITH-DIAERESIS #\U00F6)
        (cons 'CYRILLIC-CAPITAL-LETTER-ZHE #\U0416)
        (cons 'EURO-SIGN #\U20AC)
        (cons 'MUSICAL-SYMBOL-G-CLEF #\U1D11E)))
 
;; For each (name . char) entry print the character (written and displayed),
;; its UTF-8 bytes in hex, the string decoded back from those bytes, and the name.
(for ([entry (in-list char-map)])
  (match-define (cons name chr) entry)
  ;; Encode the one-character string and expose the bytes as a list of integers.
  (define octets (bytes->list (string->bytes/utf-8 (string chr))))
  (printf "~s\t~a\t~a\t~a\t~a~%" chr chr
          (for/list ([octet (in-list octets)])
            (number->string octet 16))
          (bytes->string/utf-8 (apply bytes octets))
          name))
Output:
#\A	A	(41)	A	LATIN-CAPITAL-LETTER-A
#\ö	ö	(c3 b6)	ö	LATIN-SMALL-LETTER-O-WITH-DIAERESIS
#\Ж	Ж	(d0 96)	Ж	CYRILLIC-CAPITAL-LETTER-ZHE
#\€	€	(e2 82 ac)	€	EURO-SIGN
#\𝄞	𝄞	(f0 9d 84 9e)	𝄞	MUSICAL-SYMBOL-G-CLEF

Sidef[edit]

# Encode the code point `code` as UTF-8: convert it to a character, encode
# that as 'UTF-8', take the resulting bytes and map each one through .chr.
# The sample output shows the result as an array of one-character strings.
func utf8_encoder(Number code) {
code.chr.encode('UTF-8').bytes.map{.chr}
}
 
# Decode the output of utf8_encoder: map every element of `bytes` through
# .ord and decode the resulting values as 'UTF-8'.
# The caller below takes .ord of the result to get the code point back.
func utf8_decoder(Array bytes) {
bytes.map{.ord}.decode('UTF-8')
}
 
# Round-trip each of the five sample code points: encode, decode, assert that
# the ordinal of the decoded value equals the original number, then print the
# decoded value next to its encoded form.
for n in ([0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E]) {
var encoded = utf8_encoder(n)
var decoded = utf8_decoder(encoded)
assert_eq(n, decoded.ord)
say "#{decoded} -> #{encoded}"
}
Output:
A -> ["A"]
ö -> ["\xC3", "\xB6"]
Ж -> ["\xD0", "\x96"]
€ -> ["\xE2", "\x82", "\xAC"]
𝄞 -> ["\xF0", "\x9D", "\x84", "\x9E"]

zkl[edit]

// Print a heading, then one row per sample character.
println("Char  Unicode  UTF-8");
// Each test case pairs a string literal written with a \U...; escape with the
// same code point as an integer.
foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),
T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
// Fold the string into one integer: each step multiplies the accumulator by
// 0x100 and adds c.toAsc() (presumably the byte value of c -- confirm).
utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
char :=unicode_int.toString(-8); // Unicode int to UTF-8 string
// UTF-8 bytes to UTF-8 string:
char2:=Data(Void,utf_int.toBigEndian(utf_int.len())).text;
 
// Columns: both UTF-8 strings, the code point as U+hex, the packed bytes in hex.
println("%s %s %9s  %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));
}

Int.len() --> number of bytes in int. This could be hard coded because the original UTF-8 design has a max of 6 bytes (RFC 3629 limits it to 4) and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0 which is a zero terminated string ("A");

Output:
Char  Unicode  UTF-8
A A      U+41  41
ö ö      U+f6  c3b6
Ж Ж     U+416  d096
€ €    U+20ac  e282ac
𝄞 𝄞   U+1d11e  f09d849e