UTF-8 encode and decode: Difference between revisions

(→‎{{header|UNIX Shell}}: Add implementation)
(9 intermediate revisions by 7 users not shown)
Line 3:
As described in [[UTF-8]] and in [[wp:UTF-8|Wikipedia]], UTF-8 is a popular encoding of (multi-byte) [[Unicode]] code-points into eight-bit octets.
 
The goal of this task is to write an encoder that takes a unicode code-point (an integer representing a unicode character) and returns a sequence of 1–4 bytes representing that character in the UTF-8 encoding.
 
Then you have to write the corresponding decoder that takes a sequence of 1–4 UTF-8 encoded bytes and returns the corresponding unicode character.
 
Demonstrate the functionality of your encoder and decoder on the following five characters:
Line 2,519:
 
=={{header|Elena}}==
ELENA 6.x :
<syntaxhighlight lang="elena">import system'routines;
import extensions;
Line 2,532:
string printAsUTF8Array()
{
self.toByteArray().forEach::(b){ console.print(b.toString(16)," ") }
}
string printAsUTF32()
{
self.toArray().forEach::(c){ console.print("U+",c.toInt().toString(16)," ") }
}
}
Line 2,565:
€ E2 82 AC U+20AC
𝄞 F0 9D 84 9E U+1D11E</pre>
 
=={{header|FreeBASIC}}==
{{trans|VBScript}}
<syntaxhighlight lang="vbnet">' Encodes the Unicode code point x as a UTF-8 byte string of 1 to 4 bytes.
'
' Parameters:
'   x - the code point to encode (0 .. &H10FFFF, excluding surrogates).
' Returns:
'   The UTF-8 bytes as a String. When x is not encodable (negative, above
'   U+10FFFF, or a UTF-16 surrogate U+D800..U+DFFF, which RFC 3629 excludes
'   from UTF-8) a diagnostic is printed and an empty string is returned.
Function unicode_2_utf8(x As Long) As String
    Dim As String y
    Dim As Long r
    Select Case x
    Case 0 To &H7F
        ' 0xxxxxxx
        y = Chr(x)
    Case &H80 To &H7FF
        ' 110xxxxx 10xxxxxx
        y = Chr(192 + x \ 64) + Chr(128 + x Mod 64)
    Case 55296 To 57343
        ' U+D800..U+DFFF: surrogates are not Unicode scalar values.
        ' Must be tested before the general three-byte range below,
        ' because Select Case takes the first matching branch.
        Print "surrogate code point cannot be encoded: " & x & " " & Hex(x)
    Case &H800 To &H7FFF, 32768 To 65535
        ' 1110xxxx 10xxxxxx 10xxxxxx (decimal bounds kept from the original)
        r = x \ 64
        y = Chr(224 + r \ 64) + Chr(128 + r Mod 64) + Chr(128 + x Mod 64)
    Case &H10000 To &H10FFFF
        ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        r = x \ 4096
        y = Chr(240 + r \ 64) + Chr(128 + r Mod 64) + Chr(128 + (x \ 64) Mod 64) + Chr(128 + x Mod 64)
    Case Else
        Print "what else? " & x & " " & Hex(x)
    End Select
    Return y
End Function
 
' Decodes exactly one UTF-8 encoded character to its Unicode code point.
'
' Parameters:
'   x - a string holding the 1 to 4 bytes of a single UTF-8 sequence.
' Returns:
'   The decoded code point, or -1 after printing a diagnostic when x is not
'   one well-formed sequence: empty or longer than 4 bytes, wrong leading or
'   continuation byte, overlong form, surrogate (U+D800..U+DFFF), or a value
'   above U+10FFFF.
Function utf8_2_unicode(x As String) As Long
    Const INVALID_CP As Long = -1
    Dim As Long primero, segundo, tercero, cuarto
    ' Stays INVALID_CP unless a branch validates every byte, so a partially
    ' decoded value is never returned after an error message.
    Dim As Long total = INVALID_CP

    Select Case Len(x)
    Case 0
        Print "empty input"
    Case 1 'one byte: 0xxxxxxx
        If Asc(x) < 128 Then
            total = Asc(x)
        Else
            Print "highest bit set error"
        End If
    Case 2 'two bytes: 110xxxxx 10xxxxxx
        If Asc(x) \ 32 <> 6 Then
            Print "leading byte error"
        ElseIf Asc(x, 2) \ 64 <> 2 Then
            Print "mask error"
        Else
            primero = Asc(x) Mod 32
            segundo = Asc(x, 2) Mod 64
            total = 64 * primero + segundo
            ' Values below U+0080 must use the one-byte form.
            If total < &H80 Then
                Print "overlong encoding error"
                total = INVALID_CP
            End If
        End If
    Case 3 'three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        If Asc(x) \ 16 <> 14 Then
            Print "leading byte error"
        ElseIf Asc(x, 2) \ 64 <> 2 Then
            Print "mask error middle byte"
        ElseIf Asc(x, 3) \ 64 <> 2 Then
            Print "mask error last byte"
        Else
            primero = Asc(x) Mod 16
            segundo = Asc(x, 2) Mod 64
            tercero = Asc(x, 3) Mod 64
            total = 4096 * primero + 64 * segundo + tercero
            If total < &H800 Then
                Print "overlong encoding error"
                total = INVALID_CP
            ElseIf total >= 55296 And total <= 57343 Then
                ' U+D800..U+DFFF are UTF-16 surrogates, not valid in UTF-8.
                Print "surrogate code point error"
                total = INVALID_CP
            End If
        End If
    Case 4 'four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        If Asc(x) \ 8 <> 30 Then
            Print "mask error leading byte"
        ElseIf Asc(x, 2) \ 64 <> 2 Then
            Print "mask error second byte"
        ElseIf Asc(x, 3) \ 64 <> 2 Then
            Print "mask error tercero byte"
        ElseIf Asc(x, 4) \ 64 <> 2 Then
            Print "mask error last byte"
        Else
            primero = Asc(x) Mod 8
            segundo = Asc(x, 2) Mod 64
            tercero = Asc(x, 3) Mod 64
            cuarto = Asc(x, 4) Mod 64
            total = 262144 * primero + 4096 * segundo + 64 * tercero + cuarto
            If total < &H10000 Then
                Print "overlong encoding error"
                total = INVALID_CP
            ElseIf total > &H10FFFF Then
                Print "code point out of range error"
                total = INVALID_CP
            End If
        End If
    Case Else
        Print "more bytes than expected"
    End Select
    Return total
End Function
 
' Demonstration: for each sample code point print the character, its code
' point in hex, the UTF-8 bytes produced by unicode_2_utf8, and the code
' point recovered by utf8_2_unicode.
Dim As Long cp(4) = {65, 246, 1046, 8364, 119070} '[{&H0041,&H00F6,&H0416,&H20AC,&H1D11E}]
Dim As String r, s
Dim As integer i, j
Print "ch unicode UTF-8 encoded decoded"
For i = Lbound(cp) To Ubound(cp)
    Dim As Long cpi = cp(i)
    r = unicode_2_utf8(cpi)
    s = Hex(cpi)
    ' Show the character by printing its UTF-8 bytes: Chr(cpi) yields only a
    ' single byte, so it cannot represent code points above 255.
    ' NOTE(review): rendering assumes a UTF-8 console - confirm on the target platform.
    Print r; Space(10 - Len(s)); s;
    ' Build the space-separated hex dump of the encoded bytes.
    s = ""
    For j = 1 To Len(r)
        s &= Hex(Asc(Mid(r, j, 1))) & " "
    Next j
    Print Space(16 - Len(s)); s;
    ' Round-trip: decode the bytes and show the resulting code point in hex.
    s = Hex(utf8_2_unicode(r))
    Print Space(8 - Len(s)); s
Next i

' Wait for a key press so the console window stays open.
Sleep</syntaxhighlight>
 
=={{header|F_Sharp|F#}}==
Line 3,161 ⟶ 3,270:
 
=={{header|Julia}}==
Julia supports the UTF-8 encoding (and others through packages).

<syntaxhighlight lang="julia">
for t in ("A", "ö", "Ж", "€", "𝄞")
    println(t, " → ", codeunits(t))
end
</syntaxhighlight>

{{works with|Julia|0.6}}

Julia supports by default UTF-8 encoding.

<syntaxhighlight lang="julia">for t in ("A", "ö", "Ж", "€", "𝄞")
    enc = Vector{UInt8}(t)
    dec = String(enc)
    println(dec, " → ", enc)
end</syntaxhighlight>
 
{{out}}
Line 3,211 ⟶ 3,318:
 
=={{header|langur}}==
{{works with|langur|0.8.4}}
<syntaxhighlight lang="langur">writeln "character Unicode UTF-8 encoding (hex)"
 
Line 3,217 ⟶ 3,323:
val .utf8 = s2b cp2s .cp
val .cpstr = b2s .utf8
val .utf8rep = join " ", map ffn .b: $"\{{.b:X02;}}", .utf8
writeln $"\{{.cpstr:-11;}} U+\{{.cp:X04:-8;}} \{{.utf8rep;}}"
}
}</syntaxhighlight>
</syntaxhighlight>
 
{{out}}
Line 3,930 ⟶ 4,037:
writeln("-------------------------------------------------");
for ch range "AöЖ€𝄞" do
utf8 := striToUtf8toUtf8(str(ch));
writeln(ch rpad 11 <& "U+" <& ord(ch) radix 16 lpad0 4 rpad 7 <&
hex(utf8) rpad 22 <& utf8ToStrifromUtf8(utf8));
end for;
end func;</syntaxhighlight>
Line 4,265 ⟶ 4,372:
Dim h_0,h_2,h_3,h_4
Dim mc_0,mc_2,mc_3,mc_4
Dim mm_0
 
m_1=&h3F
Line 4,283 ⟶ 4,389:
mc_3=&hF
mc_4=&h7
 
mm_0=&h3f
 
 
Function cp2utf8(cp) 'cp as long, returns string
Line 4,334 ⟶ 4,437:
Dim c,c0,c1,c2,u
c=b(i):c0=pad(c(0),29) :c1=c(1) :c2=pad(c(2),12):u=cp2utf8(c1)
print c0 & " CP:" & pad("U+" & Hex(c1),-68) & " my utf8:" & utf8displ (u) & " should be:" & c2 & " back to CP:" & pad("U+" & Hex(utf82cp(u)),-68)& vbCrLf
End Sub
 
Line 4,354 ⟶ 4,457:
<small>
<pre>
LATIN CAPITAL LETTER A CP: U+41 my utf8: 41 should be: 41 back to CP: U+41
LATIN SMALL LETTER O WITH DIA CP: U+F6 my utf8: C3 B6 should be: C3 B6 back to CP: U+F6
CYRILLIC CAPITAL LETTER ZHE CP: U+416 my utf8: D0 96 should be: D0 96 back to CP: U+416
EURO SIGN CP: U+20AC my utf8: E2 82 AC should be: E2 82 AC back to CP: U+20AC
MUSICAL SYMBOL G CLEF CP: U+1D11E my utf8: F0 9D 84 9E should be: F0 9D 84 9E back to CP: U+1D11E
</pre>
</small>
 
=={{header|Wren}}==
The utf8_decode function was translated from the Go entry.
<syntaxhighlight lang="wren">import "./fmt" for Fmt
 
var utf8_encode = Fn.new { |cp| String.fromCodePoint(cp).bytes.toList }
1,006

edits