UTF-8 encode and decode: Difference between revisions

Line 1,023:

#20AC -> {#E2,#82,#AC} -> {#20AC}

#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}

</pre>

=={{header|PureBasic}}==

The encoding and decoding procedure are kept simple and designed to work with an array of 5 elements for input/output of the UTF-8 encoding for a single code point at a time. It was decided not to use a more elaborate example that would have been able to operate on a buffer to encode/decode more than one code point at a time.

<lang purebasic>#UTF8_codePointMaxByteCount = 4 ;UTF-8 encoding uses only a maximum of 4 bytes to encode a codepoint

Procedure UTF8_encode(x, Array encoded_codepoint.a(1)) ;x is codepoint to encode, the array will contain output

;Array encoded_codepoint() is used for output.

;After encode element zero holds the count of significant bytes in elements 1 to 4

If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount

ReDim encoded_codepoint.a(#UTF8_codePointMaxByteCount)

EndIf

Select x

Case 0 To $7F

encoded_codepoint(0) = 1

encoded_codepoint(1) = x ;all 7 bits

Case $80 To $7FF

encoded_codepoint(0) = 2

encoded_codepoint(2) = (x & %00111111) | %10000000 ;lowest 6 bits

encoded_codepoint(1) = (x >> 6) | %11000000 ;highest bits 7 -> 11

Case $800 To $FFFF

encoded_codepoint(0) = 3

encoded_codepoint(3) = (x & %00111111) | %10000000 ;lowest 6 bits

encoded_codepoint(2) = ((x >> 6) & %00111111) | %10000000 ;bits 7 -> 12

encoded_codepoint(1) = (x >> 12) | %11100000 ;highest bits 13 -> 16

Case $10000 To $10FFFF

encoded_codepoint(0) = 4

encoded_codepoint(4) = (x & %00111111) | %10000000 ;lowest 6 bits

encoded_codepoint(3) = ((x >> 6) & %00111111) | %10000000 ;bits 7 -> 12

encoded_codepoint(2) = ((x >> 12) & %00111111) | %10000000 ;bits 13 -> 18

encoded_codepoint(1) = (x >> 18) | %11110000 ;highest bits 19 -> 21

Default

encoded_codepoint(0) = 0 ;error, codepoint is not valid and can't be encoded

EndSelect

EndProcedure

Procedure UTF8_decode(Array encoded_codepoint.a(1))

;Array encoded_codepoint() holds the UTF-8 encoding in elements 1 to 4, element zero isn't used for decoding.

Protected x = -1 ;initialzie with error value for possible improper encoding

If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount

ProcedureReturn x ;Input array was not dimensioned properly.

EndIf

;Determine the number of bytes in the UTF8 encoding by looking at first byte

;and then proceeding accordingly.

Select encoded_codepoint(1)

Case %00000000 To %01111111 ;1 byte encoding

x = encoded_codepoint(1)

Case %11000000 To %11011111 ;2 byte encoding

x = (encoded_codepoint(1) & %00011111) << 6 ;last 5 bits only

x | (encoded_codepoint(2) & %00111111)

Case %11100000 To %11101111 ;3 byte encoding

x = (encoded_codepoint(1) & %00001111) << 6 ;last 4 bits only

x << 6 + (encoded_codepoint(2) & %00111111)

x << 6 + (encoded_codepoint(3) & %00111111)

Case %11110000 To %11110111 ;4 byte encoding

x = (encoded_codepoint(1) & %00000111) << 6 ;last 3 bits only

x << 6 + (encoded_codepoint(2) & %00111111)

x << 6 + (encoded_codepoint(3) & %00111111)

x << 6 + (encoded_codepoint(4) & %00111111)

EndSelect

ProcedureReturn x

EndProcedure

;helper procedure to format output for this example

Procedure.s formatOutput(c$, c, Array encoded_utf.a(1), dcp) ;character, codepooint, UTf8 encoding, decoded codepoint

Protected o$, i, encoding$

o$ = " " + LSet(c$, 8) + LSet("U+" + RSet(Hex(c), 5, "0"), 10)

For i = 1 To encoded_utf(0)

encoding$ + RSet(Hex(encoded_utf(i)), 2, "0") + " "

ProcedureReturn o$

EndProcedure

DataSection

;unicode code points in hex

unicode_codepoints:

Data.i 5, $41, $F6, $416, $20AC, $1D11E

;The names for these codepoints are: latin capital letter a; latin small letter o With diaeresis

;cyrillic capital letter zhe; euro sign; musical symbol g clef.

EndDataSection

;read initial unicode codepoint values

Restore unicode_codepoints

Read num_codepoints

num_codepoints - 1

Dim codepoint(num_codepoints)

For i = 0 To num_codepoints

Read codepoint(i)

;hold the byte count of the encoding followed by the respective bytes. For decoding element zero is not used and

;elements 1 To 4 holds the bytes to be decoded.

Dim encoded_codepoint.a(#UTF8_codePointMaxByteCount)

If OpenConsole("", #PB_UTF8)

PrintN(LSet("", 11) + LSet("Unicode", 12) + LSet("UTF-8",14) + LSet("Decoded",12))

PrintN(LSet("Character", 11) + LSet("Code Point", 12) + LSet("Encoding",14) + LSet("Code Point",12))

PrintN(LSet("---------", 11) + LSet("----------", 12) + LSet("-----------",14) + LSet("-----------",12))

For i = 0 To num_codepoints

UTF8_encode(codepoint(i), encoded_codepoint())

dcp = UTF8_decode(encoded_codepoint()) ;Decoded UTF-8 encoding should match original codepoint that was encoded.

PrintN(formatOutput(Chr(codepoint(i)), codepoint(i), encoded_codepoint(), dcp))

CloseConsole()

EndIf</lang>

Sample output:

<pre> Unicode UTF-8 Decoded

Character Code Point Encoding Code Point

--------- ---------- ----------- -----------

A U+00041 41 00041

ö U+000F6 C3 B6 000F6

? U+00416 D0 96 00416

? U+020AC E2 82 AC 800AC

? U+1D11E F0 9D 84 9E 1D11E

</pre>