UTF-8 encode and decode: Difference between revisions
Content deleted Content added
→{{header|Phix}}: added PureBasic |
|||
Line 1,023: | Line 1,023: | ||
#20AC -> {#E2,#82,#AC} -> {#20AC} |
#20AC -> {#E2,#82,#AC} -> {#20AC} |
||
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E} |
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E} |
||
</pre> |
|||
=={{header|PureBasic}}== |
|||
The encoding and decoding procedures are kept simple and are designed to work with an array of 5 elements for input/output of the UTF-8 encoding of a single code point at a time. It was decided not to use a more elaborate example that would operate on a buffer to encode/decode more than one code point at a time. |
|||
<lang purebasic>#UTF8_codePointMaxByteCount = 4 ;UTF-8 encoding uses only a maximum of 4 bytes to encode a codepoint |
|||
Procedure UTF8_encode(x, Array encoded_codepoint.a(1)) ;x is codepoint to encode, the array will contain output
  ;Encodes the single code point x as UTF-8 into encoded_codepoint().
  ;
  ;On return element zero holds the count of significant bytes (1 to 4) and
  ;elements 1 onward hold those bytes, lead byte first.
  ;Element zero is set to 0 when x can't be encoded: negative values, values
  ;above $10FFFF, and the UTF-16 surrogate range $D800 to $DFFF, which UTF-8
  ;does not allow to be encoded.
  ;
  ;If the array's highest index is below #UTF8_codePointMaxByteCount it is
  ;redimensioned so the count and up to 4 bytes fit.
  If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount
    ReDim encoded_codepoint.a(#UTF8_codePointMaxByteCount)
  EndIf

  Select x
    Case 0 To $7F
      encoded_codepoint(0) = 1
      encoded_codepoint(1) = x ;all 7 bits

    Case $80 To $7FF
      encoded_codepoint(0) = 2
      encoded_codepoint(1) = (x >> 6) | %11000000 ;highest bits 7 -> 11
      encoded_codepoint(2) = (x & %00111111) | %10000000 ;lowest 6 bits

    Case $800 To $D7FF, $E000 To $FFFF ;the gap $D800 -> $DFFF (surrogates) falls through to Default
      encoded_codepoint(0) = 3
      encoded_codepoint(1) = (x >> 12) | %11100000 ;highest bits 13 -> 16
      encoded_codepoint(2) = ((x >> 6) & %00111111) | %10000000 ;bits 7 -> 12
      encoded_codepoint(3) = (x & %00111111) | %10000000 ;lowest 6 bits

    Case $10000 To $10FFFF
      encoded_codepoint(0) = 4
      encoded_codepoint(1) = (x >> 18) | %11110000 ;highest bits 19 -> 21
      encoded_codepoint(2) = ((x >> 12) & %00111111) | %10000000 ;bits 13 -> 18
      encoded_codepoint(3) = ((x >> 6) & %00111111) | %10000000 ;bits 7 -> 12
      encoded_codepoint(4) = (x & %00111111) | %10000000 ;lowest 6 bits

    Default
      encoded_codepoint(0) = 0 ;error, codepoint is not valid and can't be encoded
  EndSelect
EndProcedure
|||
Procedure UTF8_decode(Array encoded_codepoint.a(1))
  ;Decodes one UTF-8 sequence held in elements 1 to 4 of encoded_codepoint()
  ;and returns the code point. Element zero isn't used for decoding; the
  ;sequence length is taken from the lead byte in element 1.
  ;
  ;Returns -1 when the array's highest index is below
  ;#UTF8_codePointMaxByteCount, when element 1 is not a valid lead byte, or
  ;when a required continuation byte is not of the form %10xxxxxx.
  ;Overlong forms, surrogates and values above $10FFFF are not rejected here.
  Protected x = -1 ;initialize with error value for possible improper encoding
  Protected byteCount, i

  If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount
    ProcedureReturn x ;Input array was not dimensioned properly.
  EndIf

  ;Determine the number of bytes in the UTF-8 encoding by looking at the first
  ;byte and keep only its payload bits, unshifted. Each continuation byte
  ;below shifts the accumulated value left by 6 exactly once.
  Select encoded_codepoint(1)
    Case %00000000 To %01111111 ;1 byte encoding
      ProcedureReturn encoded_codepoint(1)

    Case %11000000 To %11011111 ;2 byte encoding
      byteCount = 2
      x = encoded_codepoint(1) & %00011111 ;last 5 bits only

    Case %11100000 To %11101111 ;3 byte encoding
      byteCount = 3
      x = encoded_codepoint(1) & %00001111 ;last 4 bits only

    Case %11110000 To %11110111 ;4 byte encoding
      byteCount = 4
      x = encoded_codepoint(1) & %00000111 ;last 3 bits only

    Default ;a continuation byte or an invalid lead byte in first position
      ProcedureReturn -1
  EndSelect

  For i = 2 To byteCount
    If (encoded_codepoint(i) & %11000000) <> %10000000
      ProcedureReturn -1 ;not a continuation byte
    EndIf
    x = (x << 6) | (encoded_codepoint(i) & %00111111) ;append the next 6 payload bits
  Next

  ProcedureReturn x
EndProcedure
|||
;helper procedure to format output for this example |
|||
Procedure.s formatOutput(c$, c, Array encoded_utf.a(1), dcp) ;character, code point, UTF-8 encoding, decoded code point
  ;Builds one row of the example's output table from the character c$, its
  ;code point c, the encoding in encoded_utf() (element zero is the byte count,
  ;the bytes follow from element 1) and the decoded code point dcp.
  Protected hexBytes$, idx, row$

  ;Collect the significant bytes as two-digit hex values, each followed by a space.
  idx = 1
  While idx <= encoded_utf(0)
    hexBytes$ + RSet(Hex(encoded_utf(idx)), 2, "0") + " "
    idx + 1
  Wend

  ;Assemble the row one column at a time.
  row$ = " " + LSet(c$, 8)
  row$ + LSet("U+" + RSet(Hex(c), 5, "0"), 10)
  row$ + " " + LSet(hexBytes$, 11, " ")
  row$ + " " + RSet(Hex(dcp), 5, "0")

  ProcedureReturn row$
EndProcedure
|||
DataSection
  ;Unicode code points in hex, stored as a count followed by that many values.
  unicode_codepoints:
  Data.i 5
  Data.i $41    ;latin capital letter a
  Data.i $F6    ;latin small letter o with diaeresis
  Data.i $416   ;cyrillic capital letter zhe
  Data.i $20AC  ;euro sign
  Data.i $1D11E ;musical symbol g clef
EndDataSection

;Load the code points: read the count, turn it into the highest zero-based
;index, then fill codepoint() with the values that follow.
Restore unicode_codepoints
Read num_codepoints
num_codepoints - 1

Dim codepoint(num_codepoints)
i = 0
While i <= num_codepoints
  Read codepoint(i)
  i + 1
Wend
|||
;Shared buffer passed to both UTF8_encode() and UTF8_decode(). After encoding,
;element zero holds the byte count and the following elements hold the bytes.
;Decoding ignores element zero and reads the bytes from elements 1 To 4.
Dim encoded_codepoint.a(#UTF8_codePointMaxByteCount)

If OpenConsole("", #PB_UTF8)
  ;Three header lines: column titles (two rows) and an underline row.
  header1$ = LSet("", 11) + LSet("Unicode", 12) + LSet("UTF-8",14) + LSet("Decoded",12)
  header2$ = LSet("Character", 11) + LSet("Code Point", 12) + LSet("Encoding",14) + LSet("Code Point",12)
  header3$ = LSet("---------", 11) + LSet("----------", 12) + LSet("-----------",14) + LSet("-----------",12)
  PrintN(header1$)
  PrintN(header2$)
  PrintN(header3$)

  ;Encode each code point, decode the result again and print one table row.
  ;The decoded value should match the code point that was encoded.
  For i = 0 To num_codepoints
    cp = codepoint(i)
    UTF8_encode(cp, encoded_codepoint())
    decoded = UTF8_decode(encoded_codepoint())
    row$ = formatOutput(Chr(cp), cp, encoded_codepoint(), decoded)
    PrintN(row$)
  Next

  Print(#CRLF$ + #CRLF$ + "Press ENTER to exit")
  Input()
  CloseConsole()
EndIf</lang>
|||
Sample output: |
|||
<pre> Unicode UTF-8 Decoded |
|||
Character Code Point Encoding Code Point |
|||
--------- ---------- ----------- ----------- |
|||
A U+00041 41 00041 |
|||
ö U+000F6 C3 B6 000F6 |
|||
? U+00416 D0 96 00416 |
|||
? U+020AC E2 82 AC 800AC |
|||
? U+1D11E F0 9D 84 9E 1D11E |
|||
</pre> |
</pre> |
||