UTF-8 encode and decode: Difference between revisions

(→‎{{header|Vlang}}: Rename "Vlang" in "V (Vlang)")
 
(12 intermediate revisions by 8 users not shown)
Line 3:
As described in [[UTF-8]] and in [[wp:UTF-8|Wikipedia]], UTF-8 is a popular encoding of (multi-byte) [[Unicode]] code-points into eight-bit octets.
 
The goal of this task is to write an encoder that takes a Unicode code-point (an integer representing a Unicode character) and returns a sequence of 1–4 bytes representing that character in the UTF-8 encoding.
 
Then you have to write the corresponding decoder that takes a sequence of 1–4 UTF-8 encoded bytes and returns the corresponding Unicode character.
 
Demonstrate the functionality of your encoder and decoder on the following five characters:
Line 2,519:
 
=={{header|Elena}}==
ELENA 6.x :
<syntaxhighlight lang="elena">import system'routines;
import extensions;
Line 2,532:
string printAsUTF8Array()
{
self.toByteArray().forEach::(b){ console.print(b.toString(16)," ") }
}
string printAsUTF32()
{
self.toArray().forEach::(c){ console.print("U+",c.toInt().toString(16)," ") }
}
}
Line 2,565:
€ E2 82 AC U+20AC
𝄞 F0 9D 84 9E U+1D11E</pre>
 
=={{header|FreeBASIC}}==
{{trans|VBScript}}
<syntaxhighlight lang="vbnet">' Encode the Unicode code point x as a UTF-8 byte string (1 to 4 bytes).
' Code points outside 0..&H10FFFF print a diagnostic and yield an empty string.
Function unicode_2_utf8(x As Long) As String
    Dim As String encoded

    If x >= 0 AndAlso x <= &H7F Then
        ' 0xxxxxxx
        encoded = Chr(x)
    ElseIf x >= &H80 AndAlso x <= &H7FF Then
        ' 110xxxxx 10xxxxxx
        encoded = Chr(&HC0 Or (x Shr 6), &H80 Or (x And &H3F))
    ElseIf x >= &H800 AndAlso x <= &HFFFF Then
        ' 1110xxxx 10xxxxxx 10xxxxxx
        encoded = Chr(&HE0 Or (x Shr 12), _
                      &H80 Or ((x Shr 6) And &H3F), _
                      &H80 Or (x And &H3F))
    ElseIf x >= &H10000 AndAlso x <= &H10FFFF Then
        ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        encoded = Chr(&HF0 Or (x Shr 18), _
                      &H80 Or ((x Shr 12) And &H3F), _
                      &H80 Or ((x Shr 6) And &H3F), _
                      &H80 Or (x And &H3F))
    Else
        Print "what else? " & x & " " & Hex(x)
    End If

    Return encoded
End Function
 
' Decode a single UTF-8 sequence (1 to 4 bytes, held in x) to its code point.
' Each byte is validated in order; the first failure prints a diagnostic and
' stops validating further bytes. Payload fields that were never validated
' stay 0, and the result is still assembled from whatever was collected.
Function utf8_2_unicode(x As String) As Long
    Dim As Long lead = Asc(x)
    Dim As Long b2 = Asc(x, 2), b3 = Asc(x, 3), b4 = Asc(x, 4)
    Dim As Long p1, p2, p3, p4      ' payload bits of bytes 1..4
    Dim As Long codepoint

    Select Case Len(x)
    Case 1      ' 0xxxxxxx
        If lead >= 128 Then
            Print "highest bit set error"
        Else
            codepoint = lead
        End If

    Case 2      ' 110xxxxx 10xxxxxx
        If lead \ 32 <> 6 Then
            Print "leading byte error"
        Else
            p1 = lead Mod 32
            If b2 \ 64 <> 2 Then
                Print "mask error"
            Else
                p2 = b2 Mod 64
            End If
        End If
        codepoint = 64 * p1 + p2

    Case 3      ' 1110xxxx 10xxxxxx 10xxxxxx
        If lead \ 16 <> 14 Then
            Print "leading byte error"
        Else
            p1 = lead Mod 16
            If b2 \ 64 <> 2 Then
                Print "mask error middle byte"
            Else
                p2 = b2 Mod 64
                If b3 \ 64 <> 2 Then
                    Print "mask error last byte"
                Else
                    p3 = b3 Mod 64
                End If
            End If
        End If
        codepoint = 4096 * p1 + 64 * p2 + p3

    Case 4      ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        If lead \ 8 <> 30 Then
            Print "mask error leading byte"
        Else
            p1 = lead Mod 8
            If b2 \ 64 <> 2 Then
                Print "mask error second byte"
            Else
                p2 = b2 Mod 64
                If b3 \ 64 <> 2 Then
                    Print "mask error tercero byte"
                Else
                    p3 = b3 Mod 64
                    If b4 \ 64 <> 2 Then
                        Print "mask error last byte"
                    Else
                        p4 = b4 Mod 64
                    End If
                End If
            End If
        End If
        codepoint = Clng(262144 * p1 + 4096 * p2 + 64 * p3 + p4)

    Case Else
        Print "more bytes than expected"
    End Select

    Return codepoint
End Function
 
' Demo driver: encode each test code point, show its UTF-8 bytes in hex,
' then decode those bytes again to confirm the round trip.
Dim As Long cp(4) = {65, 246, 1046, 8364, 119070} '[{&H0041,&H00F6,&H0416,&H20AC,&H1D11E}]
Dim As String r, s
Dim As integer i, j
Print "ch unicode UTF-8 encoded decoded"
For i = Lbound(cp) To Ubound(cp)
    Dim As Long cpi = cp(i)
    r = unicode_2_utf8(cpi)
    s = Hex(cpi)
    ' Show the character by printing its UTF-8 encoding. Chr() only covers
    ' 0..255, so Chr(cpi) cannot represent code points above that range.
    ' Correct display assumes the console interprets output as UTF-8.
    Print r; Space(10 - Len(s)); s;
    ' Build the space-separated hex dump of the encoded bytes.
    s = ""
    For j = 1 To Len(r)
        s &= Hex(Asc(Mid(r, j, 1))) & " "
    Next j
    Print Space(16 - Len(s)); s;
    ' Decode the bytes back and print the resulting code point in hex.
    s = Hex(utf8_2_unicode(r))
    Print Space(8 - Len(s)); s
Next i

Sleep</syntaxhighlight>
 
=={{header|F_Sharp|F#}}==
Line 3,161 ⟶ 3,270:
 
=={{header|Julia}}==
Julia supports the UTF-8 encoding (and others through packages).
{{works with|Julia|0.6}}
 
<syntaxhighlight lang="julia">
Julia supports by default UTF-8 encoding.
for t in ("A", "ö", "Ж", "€", "𝄞")
 
println(t, " → ", codeunits(t))
<syntaxhighlight lang="julia">for t in ("A", "ö", "Ж", "€", "𝄞")
end
enc = Vector{UInt8}(t)
</syntaxhighlight>
dec = String(enc)
println(dec, " → ", enc)
end</syntaxhighlight>
 
{{out}}
Line 3,211 ⟶ 3,318:
 
=={{header|langur}}==
<syntaxhighlight lang="langur">
{{works with|langur|0.8.4}}
<syntaxhighlight lang="langur">writeln "character Unicode UTF-8 encoding (hex)"
 
for .cp in "AöЖ€𝄞" {
val .utf8 = s2b cp -> cp2s .cp-> s2b
val .cpstr = b2s .utf8 -> b2s
val .utf8rep = join (" ", map(fn f $b:"\.{{b:X02;}}", .utf8))
writeln $"\.{{cpstr:-11;}} U+\.{{cp:X04:-8;}} \.{{utf8rep;}}"
}
}</syntaxhighlight>
</syntaxhighlight>
 
{{out}}
Line 3,930 ⟶ 4,038:
writeln("-------------------------------------------------");
for ch range "AöЖ€𝄞" do
utf8 := striToUtf8toUtf8(str(ch));
writeln(ch rpad 11 <& "U+" <& ord(ch) radix 16 lpad0 4 rpad 7 <&
hex(utf8) rpad 22 <& utf8ToStrifromUtf8(utf8));
end for;
end func;</syntaxhighlight>
Line 4,072 ⟶ 4,180:
U+1D11E 𝄞 f0 9d 84 9e
</pre>
 
=={{header|UNIX Shell}}==
{{works with|Bourne Again SHell}}
{{works with|Korn Shell}}
{{works with|Zsh}}
 
Works with locale set to UTF-8.
 
<syntaxhighlight lang="bash">function encode {
typeset -i code_point=$1
printf "$(printf '\\U%08X\\n' "$code_point")"
}
# decode CHARACTER
# Print the code point of CHARACTER as U+XXXX (at least four hex digits).
# The leading quote in the printf argument makes printf use the numeric
# value of the character that follows it.
function decode {
  typeset character=$1
  printf 'U+%04X\n' "'$character"
}
# Round-trip each sample character: decode it to a code point, encode that
# code point back to a character, and dump the resulting bytes in hex.
printf 'Char\tCode Point\tUTF-8 Bytes\n'
for glyph in A ö Ж € 𝄞; do
  cp=$(decode "$glyph")
  # Strip the "U+" prefix and read the remainder as a base-16 number.
  encoded=$(encode "$(( 16#${cp#U+} ))")
  hex_bytes=$(printf '%b' "$encoded" | od -An -tx1 | sed -nE '/./s/^ *| *$//p')
  printf '%-4b\t%-10s\t%s\n' "$encoded" "$cp" "$hex_bytes"
done</syntaxhighlight>
 
{{Out}}
<pre style="font-family: Consolas,Courier,monospace">Char Code Point UTF-8 Bytes
A U+0041 41
ö U+00F6 c3 b6
Ж U+0416 d0 96
€ U+20AC e2 82 ac
𝄞 U+1D11E f0 9d 84 9e</pre>
 
=={{header|VBA}}==
Line 4,225 ⟶ 4,365:
𝄞 U+1D11E f09d849e 𝄞
</pre>
 
=={{header|VBScript}}==
<syntaxhighlight lang="vb">
Option Explicit

' Marker bits OR-ed into encoded bytes: h_0 for continuation bytes,
' h_2 / h_3 / h_4 for the lead byte of 2-, 3- and 4-byte sequences.
Dim h_0,h_2,h_3,h_4
h_0=&h80
h_2=&hC0
h_3=&hE0
h_4=&hF0

' Payload masks used when decoding: mc_2 / mc_3 / mc_4 for the lead byte of
' 2-, 3- and 4-byte sequences. mc_0 is declared but not used by the functions below.
Dim mc_0,mc_2,mc_3,mc_4
mc_0=&h3f
mc_2=&h1F
mc_3=&hF
mc_4=&h7

' Masks (m_*) selecting successive 6-bit groups of a code point, and the
' divisors / multipliers (d_*) that shift by 6, 12 and 18 bits.
Dim m_1,m_2,m_3,m_4
Dim d_2,d_3,d_4
m_1=&h3F                        ' bits 0..5
d_2=m_1+1                       ' 64
m_2=m_1 * d_2                   ' bits 6..11
d_3= (m_2 Or m_1)+1             ' 4096
m_3= m_2* d_2                   ' bits 12..17
d_4=(m_3 Or m_2 Or m_1)+1       ' 262144
 
' Encode code point cp (a Long) as a string of 1 to 4 UTF-8 bytes.
' Relies on the script-level marker (h_*), mask (m_*) and divisor (d_*) values.
Function cp2utf8(cp) 'cp as long, returns string
    Dim lastByte
    Select Case True
        Case cp < &h80
            ' Single byte: the code point itself.
            cp2utf8 = Chr(cp)
        Case cp <= &H7FF
            lastByte = Chr(h_0 Or (cp And m_1))
            cp2utf8 = Chr(h_2 Or (cp \ d_2)) & lastByte
        Case cp <= &Hffff&
            lastByte = Chr(h_0 Or (cp And m_1))
            cp2utf8 = Chr(h_3 Or (cp \ d_3)) & _
                      Chr(h_0 Or ((cp And m_2) \ d_2)) & _
                      lastByte
        Case Else
            lastByte = Chr(h_0 Or (cp And m_1))
            cp2utf8 = Chr(h_4 Or (cp \ d_4)) & _
                      Chr(h_0 Or ((cp And m_3) \ d_3)) & _
                      Chr(h_0 Or ((cp And m_2) \ d_2)) & _
                      lastByte
    End Select
End Function
 
' Decode one UTF-8 sequence (1 to 4 bytes, one character per byte) to its code point.
' The string is reversed so the bytes are read from last to first; each byte's
' payload bits are masked out and moved into place by multiplying with the
' script-level divisors d_2 / d_3 / d_4. No validation of the byte pattern is done.
Function utf82cp(utf) 'utf as string, returns long
    Dim a,b,m
    m=strreverse(utf)
    b= Len(utf)
    ' Last byte: low bits of the code point (the whole value for a 1-byte sequence).
    a=asc(mid(m,1,1))
    utf82cp=a And &h7f
    if b=1 Then Exit Function
    ' Second-to-last byte: lead byte of a 2-byte sequence, otherwise a continuation byte.
    a=asc(mid(m,2,1))
    If b=2 Then utf82cp= utf82cp Or (a And mc_2)*d_2 :Exit function
    utf82cp= utf82cp Or (a And m_1)*d_2
    ' Third-to-last byte: lead byte of a 3-byte sequence, otherwise a continuation byte.
    a=asc(mid(m,3,1))
    If b=3 Then utf82cp= utf82cp Or (a And mc_3)*d_3 :Exit function
    utf82cp= utf82cp Or (a And m_1)*d_3
    ' Lead byte of a 4-byte sequence. It must be fetched in its own statement:
    ' inside an expression "a=..." is a comparison, not an assignment, and
    ' would discard the lead byte's payload bits.
    a=asc(mid(m,4,1))
    utf82cp= utf82cp Or (a And mc_4)*d_4
End Function
 
' Write s to standard output without appending a newline.
' If the write fails with error &h80070006, tell the user to run the script
' with CScript and terminate.
Sub print(s)
    On Error Resume Next
    WScript.stdout.Write (s)
    If err = &h80070006& Then
        WScript.Echo " Please run this script with CScript"
        WScript.quit
    End If
End Sub
' Render the bytes of utf as hex values, each preceded by a space,
' padded or cut to 12 characters via pad().
Function utf8displ(utf)
    Dim hexBytes, pos
    hexBytes = ""
    pos = 1
    Do While pos <= Len(utf)
        hexBytes = hexBytes & " " & Hex(Asc(Mid(utf, pos, 1)))
        pos = pos + 1
    Loop
    utf8displ = pad(hexBytes, 12)
End Function
 
' Fit s into a field of exactly |n| characters, padding with spaces or cutting.
' n >= 0: left-justified (keeps the leftmost n characters).
' n <  0: right-justified (keeps the rightmost |n| characters).
Function pad(s, n)
    If n >= 0 Then
        pad = Left(s & Space(n), n)
    Else
        pad = Right(Space(-n) & s, -n)
    End If
End Function
 
' Print one report line for test case i of the script-level array b:
' the description, the code point, the bytes produced by cp2utf8, the
' expected bytes, and the code point obtained by decoding those bytes again.
Sub check(i)
    Dim entry, label, codePoint, expected, encoded, roundTrip
    entry = b(i)
    label = pad(entry(0), 29)
    codePoint = entry(1)
    expected = pad(entry(2), 12)
    encoded = cp2utf8(codePoint)
    roundTrip = utf82cp(encoded)
    print label & " CP:" & pad("U+" & Hex(codePoint), -8) & _
          " my utf8:" & utf8displ(encoded) & _
          " should be:" & expected & _
          " back to CP:" & pad("U+" & Hex(roundTrip), -8) & vbCrLf
End Sub
 
' Test cases: description, code point, expected UTF-8 bytes as a hex string.
Dim b, caseIndex
b = Array( _
    Array("LATIN CAPITAL LETTER A ",&h41," 41"), _
    Array("LATIN SMALL LETTER O WITH DIAERESIS ",&hF6," C3 B6"), _
    Array("CYRILLIC CAPITAL LETTER ZHE ",&h416," D0 96"), _
    Array("EURO SIGN",&h20AC," E2 82 AC "), _
    Array("MUSICAL SYMBOL G CLEF ",&h1D11E," F0 9D 84 9E"))

' Report every test case in order.
For caseIndex = LBound(b) To UBound(b)
    check caseIndex
Next
</syntaxhighlight>
{{out}}
<small>
<pre>
LATIN CAPITAL LETTER A CP: U+41 my utf8: 41 should be: 41 back to CP: U+41
LATIN SMALL LETTER O WITH DIA CP: U+F6 my utf8: C3 B6 should be: C3 B6 back to CP: U+F6
CYRILLIC CAPITAL LETTER ZHE CP: U+416 my utf8: D0 96 should be: D0 96 back to CP: U+416
EURO SIGN CP: U+20AC my utf8: E2 82 AC should be: E2 82 AC back to CP: U+20AC
MUSICAL SYMBOL G CLEF CP: U+1D11E my utf8: F0 9D 84 9E should be: F0 9D 84 9E back to CP: U+1D11E
</pre>
</small>
 
=={{header|Wren}}==
The utf8_decode function was translated from the Go entry.
<syntaxhighlight lang="wren">import "./fmt" for Fmt
 
var utf8_encode = Fn.new { |cp| String.fromCodePoint(cp).bytes.toList }
1,006

edits