UTF-8 encode and decode: Difference between revisions

another take in Tcl
(another take in Tcl)
Line 401:
0x20ac {E2 82 AC} -> €
0x1d11e {EF BF BD} -> �
</pre>
 
=== Alternative Implementation ===
While perhaps not as readable as the version above, this implementation also handles codepoints beyond the BMP by manually composing their UTF-8 byte sequences and writing the raw bytes to the console. The <tt>encoding convertto utf-8</tt> command still does the heavy lifting where it can.
 
<lang Tcl># utf8 -- encode one Unicode code point as a UTF-8 byte sequence.
#
# Arguments:
#   codepoint   the code point written in hexadecimal (e.g. 20AC or 1D11E)
#
# Results:
#   Returns the UTF-8 encoding as a byte string.
#   Raises an error if the argument cannot be scanned as hexadecimal or
#   lies outside the Unicode range 0..10FFFF.
proc utf8 {codepoint} {
    # scan reports how many conversions succeeded; anything other than 1
    # means no hexadecimal number could be read from the argument.
    if {[scan $codepoint %llx cp] != 1} {
        error "expected a hexadecimal code point but got \"$codepoint\""
    }
    # A 4-byte UTF-8 sequence carries at most 21 payload bits, and Unicode
    # stops at U+10FFFF; larger values used to be silently mis-encoded.
    if {$cp < 0 || $cp > 0x10FFFF} {
        error "code point \"$codepoint\" is outside the range 0..10FFFF"
    }
    if {$cp < 0x10000} {
        # BMP: build the character and let the encoding subsystem do the work.
        return [encoding convertto utf-8 [format %c $cp]]
    }
    # Beyond the BMP: assemble the 4-byte form 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    # by hand, splitting the 21 payload bits 3/6/6/6.
    return [binary format c4 [list \
        [expr {0xF0 | ($cp >> 18)}] \
        [expr {0x80 | (($cp >> 12) & 0x3F)}] \
        [expr {0x80 | (($cp >> 6) & 0x3F)}] \
        [expr {0x80 | ($cp & 0x3F)}]]]
}
 
# hexchars -- render a byte string as lowercase hex pairs.
#
# Arguments:
#   s   the byte string to dump
#
# Results:
#   Returns each byte as two hex digits followed by a single space
#   (so a non-empty result ends with a space); empty input gives "".
proc hexchars {s} {
    binary scan $s H* digits
    set dump ""
    # H* always yields two digits per byte, so walk the digits pairwise.
    foreach {hi lo} [split $digits ""] {
        append dump $hi $lo " "
    }
    return $dump
}
 
# Demo driver: print each sample code point, its UTF-8 encoding and a hex dump.
# We assume the terminal is in UTF-8 mode and can display beyond-BMP characters,
# so stdout is switched to binary to pass the encoded bytes through untouched.
chan configure stdout -encoding binary
foreach hexValue {41 F6 416 20AC 1D11E} {
    set encoded [utf8 $hexValue]
    set label [format U+%04s $hexValue]
    # Columns are tab-separated: label, raw UTF-8 bytes, hex dump.
    puts [join [list $label $encoded [hexchars $encoded]] \t]
}</lang>
 
{{out}}<pre>U+0041 A 41
U+00F6 ö c3 b6
U+0416 Ж d0 96
U+20AC € e2 82 ac
U+1D11E 𝄞 f0 9d 84 9e
</pre>
 
Anonymous user