Read a file character by character/UTF8: Difference between revisions
Content added Content deleted
(→{{header|Java}}: Move declaration closer to usage) |
m (→{{header|Phix}}: added syntax colouring the hard way) |
||
Line 727: | Line 727: | ||
precisely one unicode character from a file. If there is a genuine demand for it, I |
precisely one unicode character from a file. If there is a genuine demand for it, I |
||
could easily add this to that file permanently, and document/autoinclude it properly. |
could easily add this to that file permanently, and document/autoinclude it properly. |
||
<lang Phix>constant INVALID_UTF8 = #FFFD |
|||
<!--<lang Phix>--> |
|||
function get_one_utf8_char(integer fn) |
|||
<span style="color: #008080;">constant</span> <span style="color: #000000;">INVALID_UTF8</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">#FFFD</span> |
|||
-- returns INVALID_UTF8 on error, else a string of 1..4 bytes representing one character |
|||
object res |
|||
<span style="color: #008080;">function</span> <span style="color: #000000;">get_one_utf8_char</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> |
|||
integer headb, bytes, c |
|||
<span style="color: #000080;font-style:italic;">-- returns INVALID_UTF8 on error, else a string of 1..4 bytes representing one character</span> |
|||
<span style="color: #004080;">object</span> <span style="color: #000000;">res</span> |
|||
<span style="color: #004080;">integer</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">c</span> |
|||
<span style="color: #000080;font-style:italic;">-- headb = first byte of utf-8 character:</span> |
|||
<span style="color: #000000;">headb</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">headb</span> |
|||
<span style="color: #000080;font-style:italic;">-- calculate length of utf-8 character in bytes (1..4):</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- (utf-8 starts at #0)</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b01111111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> <span style="color: #000080;font-style:italic;">-- 0b_0xxx_xxxx</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b10111111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- (it's a tail byte)</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b11011111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">2</span> <span style="color: #000080;font-style:italic;">-- 0b_110x_xxxx</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b11101111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">3</span> <span style="color: #000080;font-style:italic;">-- 0b_1110_xxxx</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b11110100</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">4</span> <span style="color: #000080;font-style:italic;">-- 0b_1111_0xzz</span> |
|||
<span style="color: #008080;">else</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- (utf-8 ends at #10FFFF)</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #000080;font-style:italic;">-- 2..4 bytes encoding (tail range: 0b_1000_0000..0b_1011_1111);</span> |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> <span style="color: #000080;font-style:italic;">-- tail bytes are valid?</span> |
|||
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#80</span> <span style="color: #008080;">or</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#BF</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- invalid tail byte or eof</span> |
|||
<span style="color: #008080;">exit</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #000000;">res</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">c</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #000080;font-style:italic;">-- 1 byte encoding (head range: 0b_0000_0000..0b_0111_1111):</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">headb</span> <span style="color: #000080;font-style:italic;">-- UTF-8 = ASCII |
|||
-- 2 bytes encoding (head range: 0b_1100_0000..0b_1101_1111):</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">2</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">#1F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#40</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b110[7..11] headb</span> |
|||
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- 0b10[1..6] tail</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#7FF</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #000080;font-style:italic;">-- sanity check</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#80</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- long form?</span> |
|||
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #000080;font-style:italic;">-- 3 bytes encoding (head range: 0b_1110_0000..0b_1110_1111):</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">3</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">#0F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#1000</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b1110[13..16] head</span> |
|||
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#40</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b10[7..12] tail</span> |
|||
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- 0b10[1..6] tail</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#FFFF</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #000080;font-style:italic;">-- sanity check</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#800</span> <span style="color: #000080;font-style:italic;">-- long form?</span> |
|||
<span style="color: #008080;">or</span> <span style="color: #0000FF;">(</span><span style="color: #000000;">c</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">#D800</span> <span style="color: #008080;">and</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">#DFFF</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- utf-16 incompatible</span> |
|||
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #000080;font-style:italic;">-- 4 bytes encoding (head range: 0b_1111_0000..0b_1111_0100):</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">4</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">#07</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#040000</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b11110[19..21] head</span> |
|||
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#1000</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b10[13..18] tail</span> |
|||
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#0040</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b10[7..12] tail</span> |
|||
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">4</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- 0b10[1..6] tail</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#10000</span> <span style="color: #000080;font-style:italic;">-- long form?</span> |
|||
<span style="color: #008080;">or</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#10FFFF</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span> <span style="color: #000080;font-style:italic;">-- utf-8 ends at #10FFFF</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #000080;font-style:italic;">-- bytes = 0; current byte is not encoded correctly:</span> |
|||
<span style="color: #008080;">else</span> |
|||
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span> |
|||
<!--</lang>--> |
|||
Test code: |
|||
-- headb = first byte of utf-8 character: |
|||
headb = getc(fn) |
|||
if headb=-1 then return -1 end if |
|||
res = ""&headb |
|||
<!--<lang Phix>--> |
|||
-- calculate length of utf-8 character in bytes (1..4): |
|||
<span style="color: #000080;font-style:italic;">--string utf8 = "aă€⼥" -- (same results as next)</span> |
|||
if headb<0 then bytes = 0 -- (utf-8 starts at #0) |
|||
<span style="color: #004080;">string</span> <span style="color: #000000;">utf8</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">utf32_to_utf8</span><span style="color: #0000FF;">({</span><span style="color: #000000;">#0061</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0103</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#20ac</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#2f25</span><span style="color: #0000FF;">})</span> |
|||
elsif headb<=0b01111111 then bytes = 1 -- 0b_0xxx_xxxx |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"length of utf8 is %d bytes\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">utf8</span><span style="color: #0000FF;">))</span> |
|||
elsif headb<=0b10111111 then bytes = 0 -- (it's a tail byte) |
|||
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"test.txt"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"wb"</span><span style="color: #0000FF;">)</span> |
|||
elsif headb<=0b11011111 then bytes = 2 -- 0b_110x_xxxx |
|||
<span style="color: #7060A8;">puts</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">,</span><span style="color: #000000;">utf8</span><span style="color: #0000FF;">)</span> |
|||
elsif headb<=0b11101111 then bytes = 3 -- 0b_1110_xxxx |
|||
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> |
|||
elsif headb<=0b11110100 then bytes = 4 -- 0b_1111_0xzz |
|||
<span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"test.txt"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"r"</span><span style="color: #0000FF;">)</span> |
|||
else bytes = 0 -- (utf-8 ends at #10FFFF) |
|||
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">5</span> <span style="color: #008080;">do</span> |
|||
end if |
|||
<span style="color: #004080;">object</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_one_utf8_char</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #008080;">if</span> <span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #000000;">LINUX</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d (%s) is %d bytes\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span> |
|||
<span style="color: #008080;">else</span> |
|||
<span style="color: #000080;font-style:italic;">-- unicode and consoles tricky on windows, so I'm |
|||
-- just avoiding that issue altogether (t)here.</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d is %d bytes\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #008080;">elsif</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d - EOF\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">i</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">exit</span> |
|||
<span style="color: #008080;">else</span> |
|||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d - INVALID_UTF8\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">i</span><span style="color: #0000FF;">)</span> |
|||
<span style="color: #008080;">exit</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span> |
|||
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span> |
|||
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> |
|||
<!--</lang>--> |
|||
-- 2..4 bytes encoding (tail range: 0b_1000_0000..0b_1011_1111); |
|||
for j=1 to bytes-1 do -- tail bytes are valid? |
|||
c = getc(fn) |
|||
if c<#80 or c>#BF then |
|||
bytes = 0 -- invalid tail byte or eof |
|||
exit |
|||
end if |
|||
res &= c |
|||
end for |
|||
-- 1 byte encoding (head range: 0b_0000_0000..0b_0111_1111): |
|||
if bytes=1 then |
|||
c = headb -- UTF-8 = ASCII |
|||
-- 2 bytes encoding (head range: 0b_1100_0000..0b_1101_1111): |
|||
elsif bytes=2 then |
|||
c = and_bits(headb, #1F)*#40 + -- 0b110[7..11] headb |
|||
and_bits(res[2], #3F) -- 0b10[1..6] tail |
|||
if c>#7FF then ?9/0 end if -- sanity check |
|||
if c<#80 then -- long form? |
|||
res = INVALID_UTF8 |
|||
end if |
|||
-- 3 bytes encoding (head range: 0b_1110_0000..0b_1110_1111): |
|||
elsif bytes=3 then |
|||
c = and_bits(headb, #0F)*#1000 + -- 0b1110[13..16] head |
|||
and_bits(res[2], #3F)*#40 + -- 0b10[7..12] tail |
|||
and_bits(res[3], #3F) -- 0b10[1..6] tail |
|||
if c>#FFFF then ?9/0 end if -- sanity check |
|||
if c<#800 -- long form? |
|||
or (c>=#D800 and c<=#DFFF) then -- utf-16 incompatible |
|||
res = INVALID_UTF8 |
|||
end if |
|||
-- 4 bytes encoding (head range: 0b_1111_0000..0b_1111_0100): |
|||
elsif bytes=4 then |
|||
c = and_bits(headb, #07)*#040000 + -- 0b11110[19..21] head |
|||
and_bits(res[2], #3F)*#1000 + -- 0b10[13..18] tail |
|||
and_bits(res[3], #3F)*#0040 + -- 0b10[7..12] tail |
|||
and_bits(res[4], #3F) -- 0b10[1..6] tail |
|||
if c<#10000 -- long form? |
|||
or c>#10FFFF then |
|||
res = INVALID_UTF8 -- utf-8 ends at #10FFFF |
|||
end if |
|||
-- bytes = 0; current byte is not encoded correctly: |
|||
else |
|||
res = INVALID_UTF8 |
|||
end if |
|||
return res |
|||
end function</lang> |
|||
Test code: |
|||
<lang Phix>--string utf8 = "aă€⼥" -- (same results as next) |
|||
string utf8 = utf32_to_utf8({#0061,#0103,#20ac,#2f25}) |
|||
printf(1,"length of utf8 is %d bytes\n",length(utf8)) |
|||
integer fn = open("test.txt","wb") |
|||
puts(fn,utf8) |
|||
close(fn) |
|||
fn = open("test.txt","r") |
|||
for i=1 to 5 do |
|||
object res = get_one_utf8_char(fn) |
|||
if string(res) then |
|||
if platform()=LINUX then |
|||
printf(1,"char %d (%s) is %d bytes\n",{i,res,length(res)}) |
|||
else |
|||
-- unicode and consoles tricky on windows, so I'm |
|||
-- just avoiding that issue altogether (t)here. |
|||
printf(1,"char %d is %d bytes\n",{i,length(res)}) |
|||
end if |
|||
elsif res=-1 then |
|||
printf(1,"char %d - EOF\n",i) |
|||
exit |
|||
else |
|||
printf(1,"char %d - INVALID_UTF8\n",i) |
|||
exit |
|||
end if |
|||
end for |
|||
close(fn)</lang> |
|||
{{out}} |
{{out}} |
||
<pre> |
<pre> |