UTF-8 encode and decode: Difference between revisions

Content added Content deleted

Inline

@@ Line 1,320: / Line 1,320: @@
 {65, 246, 1046, 8364}
 </pre>
+=={{header|Nim}}==
+=== Using the standard library ===
+Nim strings are encoded in UTF-8. The natural way to deal with UTF-8 and Unicode code points is to use the module “unicode”, which provides procedures to convert strings to sequences of code points (named “runes”) and vice versa.
+For this purpose, working with sequences of bytes is not natural. Nevertheless, here is a way to proceed using the module “unicode”.
+<lang Nim>import unicode, sequtils, strformat, strutils
+# Sample characters, one string per entry; their UTF-8 encodings range
+# from one byte up to four bytes.
+const UChars = [
+  "\u0041",
+  "\u00F6",
+  "\u0416",
+  "\u20AC",
+  "\u{1D11E}"
+]
+proc toSeqByte(r: Rune): seq[byte] =
+  ## Return the UTF-8 encoding of rune `r` as a sequence of bytes.
+  let encoded = r.toUTF8
+  result = newSeqOfCap[byte](encoded.len)
+  # Copy the encoded string one character at a time, reinterpreted as bytes.
+  for ch in encoded:
+    result.add byte(ch)
+proc toRune(s: seq[byte]): Rune =
+  ## Decode the bytes of `s` as UTF-8 and return the first rune found.
+  ## `s` must not be empty: the first element of the decoded rune
+  ## sequence is taken unconditionally.
+  var text = newStringOfCap(s.len)
+  for b in s:
+    text.add chr(b)
+  let runes = text.toRunes
+  result = runes[0]
+echo "Character  Unicode  UTF-8 encoding (hex)"
+for uchar in UChars:
+  let
+    # Rune (code point) held by the one-character UTF-8 string.
+    original = uchar.toRunes[0]
+    # Its UTF-8 encoding as a sequence of bytes.
+    encoded = original.toSeqByte
+    # Round trip: the rune rebuilt from those bytes.
+    decoded = encoded.toRune
+    hexBytes = encoded.map(toHex).join(" ")
+  # One row per character: glyph, code point, encoded bytes.
+  echo &"{uchar:>5}      U+{decoded.int.toHex(5)}  {hexBytes}"
+{{out}}
+<pre>Character  Unicode  UTF-8 encoding (hex)
+    A      U+00041  41
+    ö      U+000F6  C3 B6
+    Ж      U+00416  D0 96
+    €      U+020AC  E2 82 AC
+    𝄞      U+1D11E  F0 9D 84 9E</pre>
+=== Implementation ===
+In this section, we provide two procedures to convert a Unicode code point to a UTF-8 sequence of bytes and vice versa, without using the module “unicode”. We also provide a procedure to convert a sequence of bytes to a string in order to print it. The algorithm is the one used by the Go solution.
+<lang Nim>import sequtils, strformat, strutils
+const
+  # Lead byte of a 2-byte sequence: prefix 110, then 5 payload bits.
+  B2Lead = 0b1100_0000
+  B2Mask = 0b0001_1111
+  # Lead byte of a 3-byte sequence: prefix 1110, then 4 payload bits.
+  B3Lead = 0b1110_0000
+  B3Mask = 0b0000_1111
+  # Lead byte of a 4-byte sequence: prefix 11110, then 3 payload bits.
+  B4Lead = 0b1111_0000
+  B4Mask = 0b0000_0111
+  # Every following byte: prefix 10, then 6 payload bits.
+  MbLead = 0b1000_0000
+  MbMask = 0b0011_1111
+# A Unicode code point, kept distinct from plain `int32` values.
+type CodePoint = distinct int32
+proc toUtf8(c: CodePoint): seq[byte] =
+  ## Encode code point `c` as a UTF-8 sequence of one to four bytes.
+  ##
+  ## Raises `ValueError` if `c` is not a Unicode scalar value, that is if
+  ## it is negative, greater than U+10FFFF, or a surrogate
+  ## (U+D800..U+DFFF); such values have no UTF-8 encoding.
+  let i = int32(c)
+  if i < 0 or i > 0x10FFFF or (i >= 0xD800 and i <= 0xDFFF):
+    raise newException(ValueError, "not a Unicode scalar value: " & $i)
+  # Each byte value is assembled and masked as an `int32` first and only
+  # then narrowed, so every narrowing conversion receives a value in
+  # 0..255 and does not depend on how out-of-range values are treated.
+  if i < 0x80:
+    result = @[byte(i)]
+  elif i < 0x800:
+    result = @[byte(B2Lead or (i shr 6)),
+               byte(MbLead or (i and MbMask))]
+  elif i < 0x10000:
+    result = @[byte(B3Lead or (i shr 12)),
+               byte(MbLead or ((i shr 6) and MbMask)),
+               byte(MbLead or (i and MbMask))]
+  else:
+    result = @[byte(B4Lead or (i shr 18)),
+               byte(MbLead or ((i shr 12) and MbMask)),
+               byte(MbLead or ((i shr 6) and MbMask)),
+               byte(MbLead or (i and MbMask))]
+proc toCodePoint(b: seq[byte]): CodePoint =
+  ## Decode the UTF-8 sequence that starts at `b[0]` and return its code
+  ## point. Bytes located after that first sequence are ignored.
+  ##
+  ## Raises `ValueError` if `b` is empty, if `b[0]` cannot start a
+  ## sequence, if `b` is shorter than the length announced by `b[0]`, or
+  ## if a following byte does not have the `10xxxxxx` form.
+  ## Overlong encodings and values above U+10FFFF are not rejected.
+  if b.len == 0:
+    raise newException(ValueError, "empty UTF-8 sequence")
+  let b0 = b[0].int32
+  var
+    count: int   # total number of bytes in the sequence
+    acc: int32   # payload bits gathered so far
+  if b0 < 0x80:
+    count = 1
+    acc = b0
+  elif b0 < 0xC0 or b0 > 0xF7:
+    # 0x80..0xBF are continuation bytes; 0xF8 and above never start a
+    # sequence.
+    raise newException(ValueError, "invalid UTF-8 lead byte: " & $b0)
+  elif b0 < 0xE0:
+    count = 2
+    acc = b0 and B2Mask
+  elif b0 < 0xF0:
+    count = 3
+    acc = b0 and B3Mask
+  else:
+    count = 4
+    acc = b0 and B4Mask
+  if b.len < count:
+    raise newException(ValueError, "truncated UTF-8 sequence")
+  # Every continuation byte contributes six more payload bits.
+  for k in 1 ..< count:
+    let cont = b[k].int32
+    if (cont and 0xC0) != MbLead:
+      raise newException(ValueError,
+                         "invalid UTF-8 continuation byte: " & $cont)
+    acc = (acc shl 6) or (cont and MbMask)
+  result = CodePoint(acc)
+proc toString(s: seq[byte]): string =
+  ## Build a string whose bytes are exactly those of `s`, in order.
+  result = newString(s.len)
+  for idx, b in s:
+    result[idx] = chr(b)
+# Sample code points; their UTF-8 encodings range from one byte up to
+# four bytes.
+const UChars = [CodePoint(0x00041), CodePoint(0x000F6), CodePoint(0x00416),
+                CodePoint(0x020AC), CodePoint(0x1D11E)]
+echo "Character  Unicode  UTF-8 encoding (hex)"
+for codePoint in UChars:
+  let
+    # UTF-8 bytes of the code point.
+    encoded = codePoint.toUtf8
+    # Round trip: the code point rebuilt from those bytes.
+    decoded = encoded.toCodePoint
+    glyph = encoded.toString
+    hexBytes = encoded.map(toHex).join(" ")
+  # One row per character: glyph, code point, encoded bytes.
+  echo &"{glyph:>5}      U+{decoded.int.toHex(5)}  {hexBytes}"
+</lang>
+{{out}}
+Same output as in the previous solution.
 =={{header|Perl}}==