UTF-8 encode and decode: Difference between revisions

(Added 11l)
Line 1,320:
{65, 246, 1046, 8364}
</pre>
 
=={{header|Nim}}==
 
=== Using the standard library ===
Nim strings are encoded in UTF-8. The natural way to deal with UTF-8 and Unicode code points consists to use the module “unicode” which provides procedures to convert from strings to sequences of code points (names “runes”) and conversely.
For this purpose, using sequences or bytes is not natural. Here is a way to proceed using the module “unicode”.
 
<lang Nim>import unicode, sequtils, strformat, strutils
 
const UChars = ["\u0041", "\u00F6", "\u0416", "\u20AC", "\u{1D11E}"]
 
proc toSeqByte(r: Rune): seq[byte] =
let s = r.toUTF8
result = @(s.toOpenArrayByte(0, s.high))
 
proc toRune(s: seq[byte]): Rune =
s.mapIt(chr(it)).join().toRunes[0]
 
echo "Character Unicode UTF-8 encoding (hex)"
for uchar in UChars:
# Convert the UTF-8 string to a rune (codepoint).
var r = uchar.toRunes[0]
# Convert the rune to a sequence of bytes.
let s = r.toSeqByte
# Convert back the sequence of bytes to a rune.
r = s.toRune
# Display.
echo &"""{uchar:>5} U+{r.int.toHex(5)} {s.map(toHex).join(" ")}"""</lang>
 
{{out}}
<pre>Character Unicode UTF-8 encoding (hex)
A U+00041 41
ö U+000F6 C3 B6
Ж U+00416 D0 96
€ U+020AC E2 82 AC
𝄞 U+1D11E F0 9D 84 9E</pre>
 
=== Implementation ===
In this section, we provide two procedures to convert a Unicode code point to a UTF-8 sequence of bytes and conversely, without using the module “unicode”. We provide also a procedure to convert a sequence of bytes to a string in order to print it. The algorithm is the one used by the Go solution.
 
<lang Nim>import sequtils, strformat, strutils
 
const
 
# First byte of a 2-byte encoding starts 110 and carries 5 bits of data.
B2Lead = 0xC0 # 1100 0000
B2Mask = 0x1F # 0001 1111
 
# First byte of a 3-byte encoding starts 1110 and carries 4 bits of data.
B3Lead = 0xE0 # 1110 0000
B3Mask = 0x0F # 0000 1111
 
# First byte of a 4-byte encoding starts 11110 and carries 3 bits of data.
B4Lead = 0xF0 # 1111 0000
B4Mask = 0x07 # 0000 0111
 
# Non-first bytes start 10 and carry 6 bits of data.
MbLead = 0x80 # 1000 0000
MbMask = 0x3F # 0011 1111
 
 
type CodePoint = distinct int32
 
 
proc toUtf8(c: CodePoint): seq[byte] =
let i = int32(c)
result = if i <= 1 shl 7 - 1:
@[byte(i)]
elif i <= 1 shl 11 - 1:
@[B2Lead or byte(i shr 6),
MbLead or byte(i) and MbMask]
elif i <= 1 shl 16 - 1:
@[B3Lead or byte(i shr 12),
MbLead or byte(i shr 6) and MbMask,
MbLead or byte(i) and MbMask]
else:
@[B4Lead or byte(i shr 18),
MbLead or byte(i shr 12) and MbMask,
MbLead or byte(i shr 6) and MbMask,
MbLead or byte(i) and MbMask]
 
 
proc toCodePoint(b: seq[byte]): CodePoint =
let b0 = b[0].int32
result = CodePoint(
if b0 < 0x80: b0
elif b0 < 0xE0: (b0 and B2Mask) shl 6 or b[1].int32 and MbMask
elif b0 < 0xF0: (b0 and B3Mask) shl 12 or
(b[1].int32 and MbMask) shl 6 or b[2].int32 and MbMask
else: (b0 and B4Mask) shl 18 or (b[1].int32 and MbMask) shl 12 or
(b[2].int32 and MbMask) shl 6 or b[3].int32 and MbMask)
 
 
proc toString(s: seq[byte]): string =
s.mapIt(chr(it)).join()
 
 
const UChars = [CodePoint(0x00041),
CodePoint(0x000F6),
CodePoint(0x00416),
CodePoint(0x020AC),
CodePoint(0x1D11E)]
 
echo "Character Unicode UTF-8 encoding (hex)"
 
for uchar in UChars:
# Convert the code point to a sequence of bytes.
let s = uchar.toUtf8
# Convert back the sequence of bytes to a code point.
let c = s.toCodePoint
# Display.
echo &"""{s.toString:>5} U+{c.int.toHex(5)} {s.map(toHex).join(" ")}"""
</lang>
 
{{out}}
Same output as in the previous solution.
 
=={{header|Perl}}==
Anonymous user