UTF-8 encode and decode: Difference between revisions
Content deleted Content added
Added Kotlin |
→{{header|Go}}: add implementation |
||
Line 159: | Line 159: | ||
=={{header|Go}}== |
=={{header|Go}}== |
||
===Implementation=== |
|||
This implementation is missing all checks for invalid data and so is not production-ready, but illustrates the basic UTF-8 encoding scheme. |
|||
<lang go>package main |
|||
import ( |
|||
"bytes" |
|||
"encoding/hex" |
|||
"fmt" |
|||
"log" |
|||
"strings" |
|||
) |
|||
// testCases pairs each sample code point with its expected UTF-8 encoding,
// written as space-separated hexadecimal byte values.
//
// The struct embeds the predeclared types rune and string, so the two fields
// are named after their types and are read as tc.rune and tc.string.
var testCases = []struct {
	rune
	string
}{
	{'A', "41"},
	{'ö', "C3 B6"},
	{'Ж', "D0 96"},
	{'€', "E2 82 AC"},
	{'𝄞', "F0 9D 84 9E"},
}
|||
func main() { |
|||
for _, tc := range testCases { |
|||
// derive some things from test data |
|||
u := fmt.Sprintf("U+%04X", tc.rune) |
|||
b, err := hex.DecodeString(strings.Replace(tc.string, " ", "", -1)) |
|||
if err != nil { |
|||
log.Fatal("bad test data") |
|||
} |
|||
// exercise encoder and decoder on test data |
|||
e := encodeUTF8(tc.rune) |
|||
d := decodeUTF8(b) |
|||
// show function return values |
|||
fmt.Printf("%c %-7s %X\n", d, u, e) |
|||
// validate return values against test data |
|||
if !bytes.Equal(e, b) { |
|||
log.Fatal("encodeUTF8 wrong") |
|||
} |
|||
if d != tc.rune { |
|||
log.Fatal("decodeUTF8 wrong") |
|||
} |
|||
} |
|||
} |
|||
// Bit patterns used by the UTF-8 encoder and decoder.
//
// Each "Lead" constant holds the fixed high-order marker bits of a byte, and
// the matching "Mask" constant selects the low-order bits of that byte that
// carry code point data.
const (
	// first byte of a 2-byte encoding starts 110 and carries 5 bits of data
	b2Lead = 0xC0 // 1100 0000
	b2Mask = 0x1F // 0001 1111

	// first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
	b3Lead = 0xE0 // 1110 0000
	b3Mask = 0x0F // 0000 1111

	// first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
	b4Lead = 0xF0 // 1111 0000
	b4Mask = 0x07 // 0000 0111

	// non-first bytes start 10 and carry 6 bits of data
	mbLead = 0x80 // 1000 0000
	mbMask = 0x3F // 0011 1111
)
|||
func encodeUTF8(r rune) []byte { |
|||
switch i := uint32(r); { |
|||
case i <= 1<<7-1: // max code point that encodes into a single byte |
|||
return []byte{byte(r)} |
|||
case i <= 1<<11-1: // into two bytes |
|||
return []byte{ |
|||
b2Lead | byte(r>>6), |
|||
mbLead | byte(r)&mbMask} |
|||
case i <= 1<<16-1: // three |
|||
return []byte{ |
|||
b3Lead | byte(r>>12), |
|||
mbLead | byte(r>>6)&mbMask, |
|||
mbLead | byte(r)&mbMask} |
|||
default: |
|||
return []byte{ |
|||
b4Lead | byte(r>>18), |
|||
mbLead | byte(r>>12)&mbMask, |
|||
mbLead | byte(r>>6)&mbMask, |
|||
mbLead | byte(r)&mbMask} |
|||
} |
|||
} |
|||
func decodeUTF8(b []byte) rune { |
|||
switch b0 := b[0]; { |
|||
case b0 < 0x80: |
|||
return rune(b0) |
|||
case b0 < 0xE0: |
|||
return rune(b0&b2Mask)<<6 | |
|||
rune(b[1]&mbMask) |
|||
case b0 < 0xF0: |
|||
return rune(b0&b3Mask)<<12 | |
|||
rune(b[1]&mbMask)<<6 | |
|||
rune(b[2]&mbMask) |
|||
default: |
|||
return rune(b0&b4Mask)<<18 | |
|||
rune(b[1]&mbMask)<<12 | |
|||
rune(b[2]&mbMask)<<6 | |
|||
rune(b[3]&mbMask) |
|||
} |
|||
}</lang> |
|||
{{out}} |
|||
<pre> |
|||
A U+0041 41 |
|||
ö U+00F6 C3B6 |
|||
Ж U+0416 D096 |
|||
€ U+20AC E282AC |
|||
𝄞 U+1D11E F09D849E |
|||
</pre> |
|||
===Library/language=== |
|||
<lang go>package main |
<lang go>package main |
||