UTF-8 encode and decode

From Rosetta Code
Task
UTF-8 encode and decode
You are encouraged to solve this task according to the task description, using any language you may know.

As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.

The goal of this task is to write an encoder that takes a Unicode code point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.

Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode code point.

Demonstrate the functionality of your encoder and decoder on the following five characters:

Character   Name                                  Unicode    UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041     41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6     C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416     D0 96
€           EURO SIGN                             U+20AC     E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E    F0 9D 84 9E

Provided below is a reference implementation in Common Lisp.

Common Lisp[edit]

Helper functions

 
(defun ascii-byte-p (octet)
  "Return T if OCTET is a single-byte 7-bit ASCII char.
The most significant bit is 0, so the allowed pattern is 0xxx xxxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  (let ((bitmask  #b10000000)
        (template #b00000000))
    ;; Bitwise-and with the bitmask #b10000000 to extract the most
    ;; significant bit, then check that it is 0 (template #b00000000).
    ;; (The previous comments wrongly described a two-bit #b11000000 mask.)
    (= (logand bitmask octet) template)))
 
(defun multi-byte-p (octet)
  "Return T if OCTET is a part of a multi-byte UTF-8 sequence.
The multi-byte pattern is 1xxx xxxx. A multi-byte can be either a lead
byte or a trail byte."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  (let ((bitmask  #b10000000)
        (template #b10000000))
    ;; Bitwise-and with the bitmask #b10000000 to extract the most
    ;; significant bit, then check that it is 1 (template #b10000000).
    ;; (The previous comments wrongly described a two-bit #b11000000 mask.)
    (= (logand bitmask octet) template)))
 
(defun lead-byte-p (octet)
  "Return a true value if OCTET is one of the leading bytes of an UTF-8
sequence, NIL otherwise.  Allowed leading byte patterns are 0xxx xxxx,
110x xxxx, 1110 xxxx and 1111 0xxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Walk the mask/template pairs in parallel; succeed on the first match.
  (loop for mask in '(#b10000000 #b11100000 #b11110000 #b11111000)
        for template in '(#b00000000 #b11000000 #b11100000 #b11110000)
        thereis (= (logand mask octet) template)))
 
(defun n-trail-bytes (octet)
  "Take a leading UTF-8 byte, return the number of continuation bytes 0-3.
A plain ASCII byte announces 0 trail bytes; the lead byte of a four-byte
sequence announces 3.  (The old docstring wrongly said 1-3.)"
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Parallel iteration replaces the index-based nth lookups.
  (loop for mask in '(#b10000000 #b11100000 #b11110000 #b11111000)
        for template in '(#b00000000 #b11000000 #b11100000 #b11110000)
        for i from 0
        when (= template (logand mask octet))
        return i))
 

Encoder

 
(defun unicode-to-utf-8 (int)
  "Take a Unicode code point INT, return a list of one to four UTF-8
encoded bytes (octets).  Signals an error for values above #x10FFFF."
  (assert (<= (integer-length int) 21))
  (let ((n-trail-bytes (cond ((<= #x00000 int #x00007F) 0)
                             ((<= #x00080 int #x0007FF) 1)
                             ((<= #x00800 int #x00FFFF) 2)
                             ((<= #x10000 int #x10FFFF) 3)
                             ;; 21-bit values above #x10FFFF pass the assert
                             ;; but are not Unicode; fail loudly here instead
                             ;; of letting (= NIL 0) below raise a type error.
                             (t (error "~x is not a valid Unicode code point" int))))
        (lead-templates (list #b00000000 #b11000000 #b11100000 #b11110000))
        (trail-template #b10000000)
        ;; number of content bits in the lead byte, indexed by n-trail-bytes.
        (n-lead-bits (list 7 5 4 3))
        ;; number of content bits in each trail byte.
        (n-trail-bits 6)
        ;; list to collect the UTF-8 encoded bytes in (pushed last byte first).
        (byte-list nil))
    (if (= n-trail-bytes 0)
        ;; if we need 0 trail bytes, it's just an ASCII single byte.
        (push int byte-list)
        (progn
          ;; if we need more than one byte, first fill the trail bytes with
          ;; 6 bits each, lowest bits first; pushing reverses them into order.
          (loop for i from 0 to (1- n-trail-bytes)
                do (push (+ trail-template
                            (ldb (byte n-trail-bits (* i n-trail-bits)) int))
                         byte-list))
          ;; then copy the remaining content bits into the lead byte.
          (push (+ (nth n-trail-bytes lead-templates)
                   (ldb (byte (nth n-trail-bytes n-lead-bits) (* n-trail-bytes n-trail-bits)) int))
                byte-list)))
    ;; return the list of UTF-8 encoded bytes.
    byte-list))
 

Decoder

 
(defun utf-8-to-unicode (byte-list)
  "Take a list of one to four UTF-8 encoded bytes (octets), return a code point.
Signals an error if the first byte is not a lead byte or if the list length
does not match the sequence length announced by the lead byte."
  (let ((b1 (car byte-list)))
    (cond ((ascii-byte-p b1) b1) ; a single ASCII byte encodes itself.
          ((multi-byte-p b1)
           (if (lead-byte-p b1)
               (let ((n (n-trail-bytes b1))
                     ;; Content bits we want to extract from each lead byte.
                     (lead-templates (list #b01111111 #b00011111 #b00001111 #b00000111))
                     ;; Content bits we want to extract from each trail byte.
                     (trail-template #b00111111))
                 (if (= n (1- (list-length byte-list)))
                     ;; the lead byte contributes the highest-order bits ...
                     (+ (ash (logand (nth 0 byte-list) (nth n lead-templates)) (* 6 n))
                        ;; ... and each trail byte adds 6 more bits below them.
                        (loop for i from 1 to n sum
                              (ash (logand (nth i byte-list) trail-template) (* 6 (- n i)))))
                     (error "calculated number of bytes doesn't match the length of the byte list")))
               (error "first byte in the list isn't a lead byte"))))))
 

The test

 
(defun test-utf-8 ()
"Return t if the chosen unicode points are encoded and decoded correctly."
(let* ((unicodes-orig (list 65 246 1046 8364 119070))
(unicodes-test (mapcar #'(lambda (x) (utf-8-to-unicode (unicode-to-utf-8 x)))
unicodes-orig)))
(mapcar #'(lambda (x)
(format t
"character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
(code-char x)
x
(unicode-to-utf-8 x)))
unicodes-orig)
;; return t if all are t
(every #'= unicodes-orig unicodes-test)))
 

Test output

 
CL-USER> (test-utf-8)
character A, code point: 41, utf-8: 41
character ö, code point: F6, utf-8: C3 B6
character Ж, code point: 416, utf-8: D0 96
character €, code point: 20AC, utf-8: E2 82 AC
character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E
T
 

D[edit]

import std.conv;
import std.stdio;
 
// The five test characters, restored from mojibake to match the code
// points in the task table (A, ö, Ж, €, 𝄞).
immutable CHARS = ["A", "ö", "Ж", "€", "𝄞"];

void main() {
    writeln("Character Code-Point Code-Units");
    foreach (c; CHARS) {
        // The raw UTF-8 bytes of a string can be accessed by casting.
        auto bytes = cast(ubyte[]) c;
        // Convert to UTF-32 and take the first element to get the code point.
        auto unicode = cast(uint) to!dstring(c)[0];
        writefln("%s  %7X [%(%X, %)]", c, unicode, bytes);
    }
}
Output:
Character   Code-Point   Code-Units
A                   41   [41]
ƶ                   F6   [C3, B6]
Š–                  416   [D0, 96]
ā‚¬                 20AC   [E2, 82, AC]
š„ž                1D11E   [F0, 9D, 84, 9E]

Go[edit]

Implementation[edit]

This implementation is missing all checks for invalid data and so is not production-ready, but illustrates the basic UTF-8 encoding scheme.

package main
 
import (
"bytes"
"encoding/hex"
"fmt"
"log"
"strings"
)
 
// testCases pairs each test code point (as a rune literal) with its
// expected UTF-8 encoding written as space-separated hex bytes.
// The rune literals are restored from mojibake; the garbled multi-char
// literals (e.g. 'ƶ') would not even compile.
var testCases = []struct {
	rune
	string
}{
	{'A', "41"},
	{'ö', "C3 B6"},
	{'Ж', "D0 96"},
	{'€', "E2 82 AC"},
	{'𝄞', "F0 9D 84 9E"},
}
 
// main round-trips every test case through encodeUTF8/decodeUTF8,
// prints the results, and aborts on any mismatch with the table.
func main() {
for _, tc := range testCases {
// derive some things from test data
u := fmt.Sprintf("U+%04X", tc.rune)
// the expected bytes, parsed from the hex string with blanks removed
b, err := hex.DecodeString(strings.Replace(tc.string, " ", "", -1))
if err != nil {
log.Fatal("bad test data")
}
// exercise encoder and decoder on test data
e := encodeUTF8(tc.rune)
d := decodeUTF8(b)
// show function return values
fmt.Printf("%c  %-7s  %X\n", d, u, e)
// validate return values against test data
if !bytes.Equal(e, b) {
log.Fatal("encodeUTF8 wrong")
}
if d != tc.rune {
log.Fatal("decodeUTF8 wrong")
}
}
}
 
// Bit patterns for each UTF-8 byte role: the *Lead constants carry the
// length marker in the high bits, the *Mask constants select data bits.
const (
// first byte of a 2-byte encoding starts 110 and carries 5 bits of data
b2Lead = 0xC0 // 1100 0000
b2Mask = 0x1F // 0001 1111

// first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
b3Lead = 0xE0 // 1110 0000
b3Mask = 0x0F // 0000 1111

// first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
b4Lead = 0xF0 // 1111 0000
b4Mask = 0x07 // 0000 0111

// non-first bytes start 10 and carry 6 bits of data
mbLead = 0x80 // 1000 0000
mbMask = 0x3F // 0011 1111
)
 
// encodeUTF8 converts a code point to its 1-4 byte UTF-8 sequence.
// As stated above, no validity checks are performed.
func encodeUTF8(r rune) []byte {
	i := uint32(r)
	if i < 1<<7 { // one byte: 0xxxxxxx
		return []byte{byte(r)}
	}
	if i < 1<<11 { // two bytes: 110xxxxx 10xxxxxx
		return []byte{
			b2Lead | byte(r>>6),
			mbLead | byte(r)&mbMask}
	}
	if i < 1<<16 { // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
		return []byte{
			b3Lead | byte(r>>12),
			mbLead | byte(r>>6)&mbMask,
			mbLead | byte(r)&mbMask}
	}
	// four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	return []byte{
		b4Lead | byte(r>>18),
		mbLead | byte(r>>12)&mbMask,
		mbLead | byte(r>>6)&mbMask,
		mbLead | byte(r)&mbMask}
}
 
// decodeUTF8 converts a UTF-8 byte sequence back to a code point.
// The sequence length is derived from the first byte alone; malformed
// input is not detected.
func decodeUTF8(b []byte) rune {
	lead := b[0]
	if lead < 0x80 { // single ASCII byte
		return rune(lead)
	}
	if lead < 0xE0 { // two-byte sequence
		return rune(lead&b2Mask)<<6 |
			rune(b[1]&mbMask)
	}
	if lead < 0xF0 { // three-byte sequence
		return rune(lead&b3Mask)<<12 |
			rune(b[1]&mbMask)<<6 |
			rune(b[2]&mbMask)
	}
	// four-byte sequence
	return rune(lead&b4Mask)<<18 |
		rune(b[1]&mbMask)<<12 |
		rune(b[2]&mbMask)<<6 |
		rune(b[3]&mbMask)
}
Output:
A  U+0041   41
ö  U+00F6   C3B6
Ж  U+0416   D096
€  U+20AC   E282AC
𝄞  U+1D11E  F09D849E

Library/language[edit]

package main
 
import (
"fmt"
"unicode/utf8"
)
 
func utf8encode(codepoint rune) []byte {
buffer := make([]byte, 4)
length := utf8.EncodeRune(buffer, codepoint)
return buffer[:length]
}
 
// utf8decode returns the first code point encoded in bytes.  The
// decoded size and DecodeRune's error sentinel are deliberately
// discarded, matching the original best-effort behavior.
func utf8decode(bytes []byte) rune {
	r, _ := utf8.DecodeRune(bytes)
	return r
}
 
func main() {
fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
for _, codepoint := range []rune{'A', 'ƶ', 'Š–', 'ā‚¬', 'š„ž'} {
encoded := utf8encode(codepoint)
decoded := utf8decode(encoded)
fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
}
}
Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ƶ       U+00F6	C3B6        	ƶ
Š–       U+0416	D096        	Š–
ā‚¬       U+20AC	E282AC      	ā‚¬
š„ž       U+1D11E	F09D849E    	š„ž

Alternately:

package main
 
import (
"fmt"
)
 
func utf8encode(codepoint rune) []byte {
return []byte(string([]rune{codepoint}))
}
 
// utf8decode converts the bytes to a string and returns its first code
// point; like the original it panics (index out of range) on empty input.
func utf8decode(bytes []byte) rune {
	decoded := []rune(string(bytes))
	return decoded[0]
}
 
func main() {
fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
for _, codepoint := range []rune{'A', 'ƶ', 'Š–', 'ā‚¬', 'š„ž'} {
encoded := utf8encode(codepoint)
decoded := utf8decode(encoded)
fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
}
}
Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ƶ       U+00F6	C3B6        	ƶ
Š–       U+0416	D096        	Š–
ā‚¬       U+20AC	E282AC      	ā‚¬
š„ž       U+1D11E	F09D849E    	š„ž

Java[edit]

Works with: Java version 7+
import java.nio.charset.StandardCharsets;
import java.util.Formatter;
 
public class UTF8EncodeDecode {
 
public static byte[] utf8encode(int codepoint) {
return new String(new int[]{codepoint}, 0, 1).getBytes(StandardCharsets.UTF_8);
}
 
public static int utf8decode(byte[] bytes) {
return new String(bytes, StandardCharsets.UTF_8).codePointAt(0);
}
 
public static void main(String[] args) {
System.out.printf("%-7s %-43s %7s\t%s\t%7s%n",
"Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");
 
for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
byte[] encoded = utf8encode(codepoint);
Formatter formatter = new Formatter();
for (byte b : encoded) {
formatter.format("%02X ", b);
}
String encodedHex = formatter.toString();
int decoded = utf8decode(encoded);
System.out.printf("%-7c %-43s U+%04X\t%-12s\tU+%04X%n",
codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
}
}
}
Output:
Char    Name                                        Unicode	UTF-8 encoded	Decoded
A       LATIN CAPITAL LETTER A                      U+0041	41          	A
ƶ       LATIN SMALL LETTER O WITH DIAERESIS         U+00F6	C3 B6       	ƶ
Š–       CYRILLIC CAPITAL LETTER ZHE                 U+0416	D0 96       	Š–
ā‚¬       EURO SIGN                                   U+20AC	E2 82 AC    	ā‚¬
š„ž      MUSICAL SYMBOL G CLEF                       U+1D11E	F0 9D 84 9E 	š„ž

JavaScript[edit]

An implementation in ECMAScript 2015 (ES6):

 
/***************************************************************************\
|* Pure UTF-8 handling without detailed error reporting functionality. *|
|***************************************************************************|
|* utf8encode *|
|* < String character or UInt32 code point *|
|* > Uint8Array encoded_character *|
|* | ErrorString *|
|* *|
|* utf8encode takes a string or uint32 representing a single code point *|
|* as its argument and returns an array of length 1 up to 4 containing *|
|* utf8 code units representing that character. *|
|***************************************************************************|
|* utf8decode *|
|* < Unit8Array [highendbyte highmidendbyte lowmidendbyte lowendbyte] *|
|* > uint32 character *|
|* | ErrorString *|
|* *|
|* utf8decode takes an array of one to four uint8 representing utf8 code *|
|* units and returns a uint32 representing that code point. *|
\***************************************************************************/

 
const utf8encode = n => {
    // Accept a String (first code point) or a number (clamped to 21 bits).
    const m = typeof n === 'string' ? n.codePointAt(0) : n & 0x1fffff;
    if (m < 0x80)
        return Uint8Array.from([m >> 0 & 0x7f | 0x00]);
    if (m < 0x800)
        return Uint8Array.from([m >> 6 & 0x1f | 0xc0, m >> 0 & 0x3f | 0x80]);
    if (m < 0x10000)
        return Uint8Array.from([m >> 12 & 0x0f | 0xe0, m >> 6 & 0x3f | 0x80, m >> 0 & 0x3f | 0x80]);
    if (m < 0x110000)
        return Uint8Array.from([m >> 18 & 0x07 | 0xf0, m >> 12 & 0x3f | 0x80, m >> 6 & 0x3f | 0x80, m >> 0 & 0x3f | 0x80]);
    throw 'Invalid Unicode Code Point!';
};

// BUG FIX: the original tested `o===o&0xbf` / `p===p&0xbf`, which parses
// as `(o===o)&0xbf` (always truthy since === binds tighter than &), so
// invalid third and fourth trail bytes were silently accepted.  The
// parenthesized comparisons below perform the intended range checks.
const utf8decode = ([m, n, o, p]) => {
    if (m < 0x80)
        return (m & 0x7f) << 0;
    if (0xc1 < m && m < 0xe0 && n === (n & 0xbf))
        return (m & 0x1f) << 6 | (n & 0x3f) << 0;
    if ((m === 0xe0 && 0x9f < n && n < 0xc0
        || 0xe0 < m && m < 0xed && 0x7f < n && n < 0xc0
        || m === 0xed && 0x7f < n && n < 0xa0
        || 0xed < m && m < 0xf0 && 0x7f < n && n < 0xc0)
        && o === (o & 0xbf))
        return (m & 0x0f) << 12 | (n & 0x3f) << 6 | (o & 0x3f) << 0;
    if ((m === 0xf0 && 0x8f < n && n < 0xc0
        || m === 0xf4 && 0x7f < n && n < 0x90
        || 0xf0 < m && m < 0xf4 && 0x7f < n && n < 0xc0)
        && o === (o & 0xbf) && p === (p & 0xbf))
        return (m & 0x07) << 18 | (n & 0x3f) << 12 | (o & 0x3f) << 6 | (p & 0x3f);
    throw 'Invalid UTF-8 encoding!';
};
 

The testing inputs:

 
// Reference data: the test string (restored from mojibake so that cps
// actually matches the hand-written code units in cus), its code
// points, and the expected UTF-8 code units from the task table.
const
str=
'AöЖ€𝄞'
,cps=
Uint32Array.from(str,s=>s.codePointAt(0))
,cus=
[ [ 0x41]
,[ 0xc3,0xb6]
,[ 0xd0,0x96]
,[ 0xe2,0x82,0xac]
,[ 0xf0,0x9d,0x84,0x9e]]
.map(a=>Uint8Array.from(a))
// zip3: recursively zip three sequences into an array of triples.
,zip3=
([a,...as],[b,...bs],[c,...cs])=>
0<as.length+bs.length+cs.length
?[ [ a,b,c],...zip3(as,bs,cs)]
:[ [ a,b,c]]
,inputs=zip3(str,cps,cus);
 

The testing code:

 
// Print an aligned table: character, code point, reference code units,
// both encoder call forms, and the decoder result.
// (Fixed the "uft8encode" typos in the column headers.)
console.log(`\
${'Character'.padEnd(16)}\
${'CodePoint'.padEnd(16)}\
${'CodeUnits'.padEnd(16)}\
${'utf8encode(ch)'.padEnd(16)}\
${'utf8encode(cp)'.padEnd(16)}\
utf8decode(cu)`)
for(let [ch,cp,cu] of inputs)
console.log(`\
${ch.padEnd(16)}\
${cp.toString(0x10).padStart(8,'U+000000').padEnd(16)}\
${`[${[...cu].map(n=>n.toString(0x10))}]`.padEnd(16)}\
${`[${[...utf8encode(ch)].map(n=>n.toString(0x10))}]`.padEnd(16)}\
${`[${[...utf8encode(cp)].map(n=>n.toString(0x10))}]`.padEnd(16)}\
${utf8decode(cu).toString(0x10).padStart(8,'U+000000')}`)
 

and finally, the output from the test:

Character       CodePoint       CodeUnits       uft8encode(ch)  uft8encode(cp)  utf8decode(cu)
A               U+000041        [41]            [41]            [41]            U+000041
ƶ               U+0000f6        [c3,b6]         [c3,b6]         [c3,b6]         U+0000f6
Š–               U+000416        [d0,96]         [d0,96]         [d0,96]         U+000416
ā‚¬               U+0020ac        [e2,82,ac]      [e2,82,ac]      [e2,82,ac]      U+0020ac
š„ž              U+01d11e        [f0,9d,84,9e]   [f0,9d,84,9e]   [f0,9d,84,9e]   U+01d11e
Note that the misalign there on the last line is caused by the string length of astral characters being 2 so the padding functions break.

Kotlin[edit]

// version 1.1.2
 
// Encode a code point by building a one-code-point String and taking its UTF-8 bytes.
fun utf8Encode(codePoint: Int) = String(intArrayOf(codePoint), 0, 1).toByteArray(Charsets.UTF_8)
 
// Decode by interpreting the bytes as UTF-8 and taking the first code point.
fun utf8Decode(bytes: ByteArray) = String(bytes, Charsets.UTF_8).codePointAt(0)
 
fun main(args: Array<String>) {
val codePoints = intArrayOf(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)
println("Char Name Unicode UTF-8 Decoded")
for (codePoint in codePoints) {
// column-width fudge: supplementary characters print as two UTF-16 units
var n = if(codePoint <= 0xFFFF) 4 else 5
System.out.printf("%-${n}c  %-35s U+%05X ", codePoint, Character.getName(codePoint), codePoint)
val bytes = utf8Encode(codePoint)
// hex dump of the encoded bytes, e.g. "E2 82 AC "
var s = ""
for (byte in bytes) s += "%02X ".format(byte)
val decoded = utf8Decode(bytes)
// same width fudge for the decoded character column
n = if(decoded.toInt() <= 0xFFFF) 12 else 11
System.out.printf("%-${n}s  %c\n", s, decoded)
}
}
Output:
Char  Name                                 Unicode  UTF-8         Decoded
A     LATIN CAPITAL LETTER A               U+00041  41            A
ƶ     LATIN SMALL LETTER O WITH DIAERESIS  U+000F6  C3 B6         ƶ
Š–     CYRILLIC CAPITAL LETTER ZHE          U+00416  D0 96         Š–
ā‚¬     EURO SIGN                            U+020AC  E2 82 AC      ā‚¬
š„ž     MUSICAL SYMBOL G CLEF                U+1D11E  F0 9D 84 9E   š„ž

Mathematica[edit]

(* Encode: the raw UTF-8 octets of the test string (restored from
   mojibake; note only the first four task characters are exercised). *)
utf = ToCharacterCode[ToString["AöЖ€", CharacterEncoding -> "UTF8"]]
(* Decode: rebuild the string from the octets and list its code points. *)
ToCharacterCode[FromCharacterCode[utf, "UTF8"]]

Output:

{65, 195, 182, 208, 150, 226, 130, 172}
{65, 246, 1046, 8364}

Perl 6[edit]

Works with: Rakudo version 2017.02

Pretty much all built in to the language.

# Table header followed by a separator rule; encoding/decoding is built in.
say sprintf("%-18s %-34s %7s %7s\t%s  %s\n", 'Character', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 94;

# The five task characters plus a beyond-BMP emoji.  Character literals
# and the » hyper operator are restored from mojibake.
for < A ö Ж € 𝄞 😜 > -> $char {
printf "  %-7s %-43s %6s U+%04s\t%12s %4s\n", $char, $char.uniname, $char.ord,
$char.ord.base(16), $char.encode('UTF8').list».base(16).Str, $char.encode('UTF8').decode;
}
Output:
Character          Name                               Ordinal Unicode	UTF-8 encoded  decoded
----------------------------------------------------------------------------------------------
   A       LATIN CAPITAL LETTER A                          65 U+0041	          41    A
   ƶ       LATIN SMALL LETTER O WITH DIAERESIS            246 U+00F6	       C3 B6    ƶ
   Š–       CYRILLIC CAPITAL LETTER ZHE                   1046 U+0416	       D0 96    Š–
   ā‚¬       EURO SIGN                                     8364 U+20AC	    E2 82 AC    ā‚¬
   š„ž       MUSICAL SYMBOL G CLEF                       119070 U+1D11E	 F0 9D 84 9E    š„ž
   šŸ˜œ      FACE WITH STUCK-OUT TONGUE AND WINKING EYE  128540 U+1F61C	 F0 9F 98 9C    šŸ˜œ

Phix[edit]

Standard autoinclude, see the manual and/or builtins/utfconv.e ( http://phix.x10.mx/docs/html/utfconv.htm and/or https://bitbucket.org/petelomax/phix/src )
As requested in the task description:

-- The five task code points.
constant tests = {#0041, #00F6, #0416, #20AC, #1D11E}

-- hex: format every element of s with fmt and join with commas.
function hex(sequence s, string fmt) -- output helper
for i=1 to length(s) do
s[i] = sprintf(fmt,s[i])
end for
return join(s,',')
end function

-- Round-trip each code point through the builtin utf32/utf8 converters
-- and print: code point -> UTF-8 bytes -> decoded code point.
for i=1 to length(tests) do
integer codepoint = tests[i]
sequence s = utf32_to_utf8({codepoint}),
r = utf8_to_utf32(s)
printf(1,"#%04x -> {%s} -> {%s}\n",{codepoint, hex(s,"#%02x"),hex(r,"#%04x")})
end for
Output:
#0041 -> {#41} -> {#0041}
#00F6 -> {#C3,#B6} -> {#00F6}
#0416 -> {#D0,#96} -> {#0416}
#20AC -> {#E2,#82,#AC} -> {#20AC}
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}

Python[edit]

 
#!/usr/bin/env python3
from unicodedata import name
 
 
def unicode_code(ch):
    """Return ch's code point formatted as U+xxxx (lower-case hex,
    zero-padded to at least four digits)."""
    return 'U+%04x' % ord(ch)
 
 
def utf8hex(ch):
    """Return ch's UTF-8 bytes as upper-case hex separated by spaces.

    format(b, 'X') reproduces hex(b)[2:].upper() exactly: upper-case
    and deliberately NOT zero-padded for bytes below 0x10.
    """
    return " ".join(format(b, 'X') for b in ch.encode('utf8'))
 
 
if __name__ == "__main__":
    # Table header; column widths match the task's reference table.
    print('{:<11} {:<36} {:<15} {:<15}'.format('Character', 'Name', 'Unicode', 'UTF-8 encoding (hex)'))
    # The five task characters, restored from mojibake (ö, Ж, €, 𝄞);
    # indentation of this block was also stripped in the wiki paste.
    chars = ['A', 'ö', 'Ж', '€', '𝄞']
    for char in chars:
        print('{:<11} {:<36} {:<15} {:<15}'.format(char, name(char), unicode_code(char), utf8hex(char)))
Output:
Character   Name                                 Unicode         UTF-8 encoding (hex)
A           LATIN CAPITAL LETTER A               U+0041          41             
ƶ           LATIN SMALL LETTER O WITH DIAERESIS  U+00f6          C3 B6          
Š–           CYRILLIC CAPITAL LETTER ZHE          U+0416          D0 96          
ā‚¬           EURO SIGN                            U+20ac          E2 82 AC       
š„ž           MUSICAL SYMBOL G CLEF                U+1d11e         F0 9D 84 9E

Racket[edit]

#lang racket
 
;; Association list of character names to the five test characters.
(define char-map
'((LATIN-CAPITAL-LETTER-A . #\U0041)
(LATIN-SMALL-LETTER-O-WITH-DIAERESIS . #\U00F6)
(CYRILLIC-CAPITAL-LETTER-ZHE . #\U0416)
(EURO-SIGN . #\U20AC)
(MUSICAL-SYMBOL-G-CLEF . #\U1D11E)))

;; For each pair: encode with string->bytes/utf-8, then print the char,
;; its UTF-8 bytes in hex, the round-tripped string, and the name.
(for ((name.char (in-list char-map)))
(define name (car name.char))
(define chr (cdr name.char))
(let ((bites (bytes->list (string->bytes/utf-8 (list->string (list chr))))))
(printf "~s\t~a\t~a\t~a\t~a~%" chr chr
(map (curryr number->string 16) bites)
(bytes->string/utf-8 (list->bytes bites))
name)))
Output:
#\A	A	(41)	A	LATIN-CAPITAL-LETTER-A
#\ƶ	ƶ	(c3 b6)	ƶ	LATIN-SMALL-LETTER-O-WITH-DIAERESIS
#\Š–	Š–	(d0 96)	Š–	CYRILLIC-CAPITAL-LETTER-ZHE
#\ā‚¬	ā‚¬	(e2 82 ac)	ā‚¬	EURO-SIGN
#\š„ž	š„ž	(f0 9d 84 9e)	š„ž	MUSICAL-SYMBOL-G-CLEF

Sidef[edit]

# Encode a code point: build the character, UTF-8 encode it, and return
# the bytes as an array of one-character strings.
func utf8_encoder(Number code) {
code.chr.encode('UTF-8').bytes.map{.chr}
}

# Decode: reassemble the byte values and decode them from UTF-8.
func utf8_decoder(Array bytes) {
bytes.map{.ord}.decode('UTF-8')
}

# Round-trip the five task code points, asserting correctness.
for n in ([0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E]) {
var encoded = utf8_encoder(n)
var decoded = utf8_decoder(encoded)
assert_eq(n, decoded.ord)
say "#{decoded} -> #{encoded}"
}
Output:
A -> ["A"]
ƶ -> ["\xC3", "\xB6"]
Š– -> ["\xD0", "\x96"]
ā‚¬ -> ["\xE2", "\x82", "\xAC"]
š„ž -> ["\xF0", "\x9D", "\x84", "\x9E"]

Tcl[edit]

Note: Tcl can handle Unicodes only up to U+FFFD, i.e. the Basic Multilingual Plane (BMP, 16 bits wide). Therefore, the fifth test fails as expected.

# encoder: turn a code point (integer) into a list of two-digit
# upper-case hex strings, one per UTF-8 byte.
proc encoder int {
set u [format %c $int]
set bytes {}
# convertto yields the UTF-8 byte string; split it into single bytes
foreach byte [split [encoding convertto utf-8 $u] ""] {
lappend bytes [format %02X [scan $byte %c]]
}
return $bytes
}
# decoder: inverse of encoder -- rebuild the byte string from the hex
# list, then let Tcl decode it from UTF-8.
proc decoder bytes {
set str {}
foreach byte $bytes {
append str [format %c [scan $byte %x]]
}
return [encoding convertfrom utf-8 $str]
}
# Round-trip each test code point and print: input, encoding, decoded char.
foreach test {0x0041 0x00f6 0x0416 0x20ac 0x1d11e} {
set res $test
lappend res [encoder $test] -> [decoder [encoder $test]]
puts $res
}
0x0041 41 -> A
0x00f6 {C3 B6} -> ö
0x0416 {D0 96} -> Ж
0x20ac {E2 82 AC} -> €
0x1d11e {EF BF BD} -> �

Alternative Implementation[edit]

While perhaps not as readable as the above, this version handles beyond-BMP codepoints by manually composing the utf-8 byte sequences and emitting raw bytes to the console. encoding convertto utf-8 command still does the heavy lifting where it can.

# utf8: return the raw UTF-8 byte string for a hex code-point string.
# BMP code points go through [encoding convertto]; astral code points
# are composed by hand because this Tcl's \u escapes stop at U+FFFF.
proc utf8 {codepoint} {
scan $codepoint %llx cp
if {$cp < 0x10000} {
set str [subst \\u$codepoint] ;# substitute per Tcl backslash rule
set bytes [encoding convertto utf-8 $str] ;# encode
} else { ;# codepoints beyond the BMP need manual approach
set bits [format %021b $cp] ;# format as binary string
set unibits 11110[string range $bits 0 2];# insert extra bits for utf-8 4-byte encoding
append unibits 10[string range $bits 3 8]
append unibits 10[string range $bits 9 14]
append unibits 10[string range $bits 15 20]
set bytes [binary format B* $unibits] ;# turn into a sequence of bytes
}
return $bytes
}
 
# hexchars: format a byte string as space-separated lowercase hex pairs.
proc hexchars {s} {
binary scan $s H* hex
# insert a space after every pair; regsub's result is the return value
regsub -all .. $hex {\0 }
}
 
# for the test, we assume the tty is in utf-8 mode and can handle beyond-BMP chars
# so set output mode to binary so we can write raw bytes!
chan configure stdout -encoding binary
# print one line per code point: U+xxxx, the character itself, hex bytes
foreach codepoint { 41 F6 416 20AC 1D11E } {
set utf8 [utf8 $codepoint]
puts "[format U+%04s $codepoint]\t$utf8\t[hexchars $utf8]"
}
Output:
U+0041  A       41
U+00F6  ö       c3 b6
U+0416  Ж       d0 96
U+20AC  €       e2 82 ac
U+1D11E 𝄞       f0 9d 84 9e

zkl[edit]

// Header, then one line per test character.
println("Char  Unicode  UTF-8");
// Each tuple pairs a UTF-8 string literal (built from \U...; escapes)
// with the same character's code point as an integer.
foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),
T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
// pack the string's UTF-8 bytes into one integer, big-endian
utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
char :=unicode_int.toString(-8); // Unicode int to UTF-8 string
// UTF-8 bytes to UTF-8 string:
char2:=Data(Void,utf_int.toBigEndian(utf_int.len())).text;

println("%s %s %9s  %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));
}

Int.len() --> number of bytes in int. This could be hard coded because UTF-8 has a max of 6 bytes and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0 which is a zero terminated string ("A");

Output:
Char  Unicode  UTF-8
A A      U+41  41
ƶ ƶ      U+f6  c3b6
Š– Š–     U+416  d096
ā‚¬ ā‚¬    U+20ac  e282ac
š„ž š„ž   U+1d11e  f09d849e