Read a file character by character/UTF8: Difference between revisions

m
→‎{{header|Wren}}: Changed to Wren S/H
(Added Wren)
m (→‎{{header|Wren}}: Changed to Wren S/H)
 
(32 intermediate revisions by 18 users not shown)
Line 11:
*   [[Read a file line by line]]
<br><br>
 
=={{header|Action!}}==
<syntaxhighlight lang="action!">byte X
; Main: copy D:FILENAME.TXT to the screen, fetching one byte per loop pass.
Proc Main()
; Channel 1 is opened with mode 4 (presumably "read" - confirm against the Action! manual).
Open (1,"D:FILENAME.TXT",4,0)
Do
; Fetch the next byte from channel 1 into X.
X=GetD(1)
; Echo the byte that was just fetched.
Put(X)
; Loop ends once the end-of-file flag for channel 1 is set.
Until EOF(1)
Od
Close(1)
Return</syntaxhighlight>
 
=={{header|AutoHotkey}}==
{{works with|AutoHotkey 1.1}}
<syntaxhighlight lang="autohotkey">; Show the contents of input.txt one character per message box.
; The encoding is passed explicitly: without it FileOpen falls back to
; A_FileEncoding (the system ANSI code page by default) for files that carry
; no byte order mark, which would split multi-byte UTF-8 sequences.
File := FileOpen("input.txt", "r", "UTF-8")
while !File.AtEOF
    MsgBox, % File.Read(1)   ; Read(1) returns one decoded character, not one byte
File.Close()                 ; release the file handle</syntaxhighlight>
 
 
=={{header|BASIC256}}==
<syntaxhighlight lang="basic256">f = freefile
# Prints the contents of file.txt, fetching one byte per loop pass.
filename$ = "file.txt"
 
open f, filename$
 
# NOTE(review): readbyte() fetches a single byte and chr() converts that one
# value, so multi-byte UTF-8 sequences are presumably not decoded as a unit - confirm.
while not eof(f)
print chr(readbyte(f));
end while
close f
end</syntaxhighlight>
 
=={{header|C}}==
<syntaxhighlight lang="c">#include <stdio.h>
#include <wchar.h>
#include <stdlib.h>
Line 39 ⟶ 67:
 
return EXIT_SUCCESS;
}</syntaxhighlight>
 
=={{header|C++}}==
<syntaxhighlight lang="cpp">
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <locale>

/*
 * Print the contents of "input.txt" to standard output, one decoded
 * character at a time.
 *
 * Returns EXIT_SUCCESS when the whole file was read, EXIT_FAILURE when the
 * file could not be opened.
 */
int main(void)
{
    /* If your native locale doesn't use UTF-8 encoding
     * you need to replace the empty string with a
     * locale like "en_US.utf8"
     */
    std::locale::global(std::locale(""));
    std::wcout.imbue(std::locale());

    // A wide input stream is required: it uses the codecvt facet of the
    // global locale (set above, before the stream is constructed) to turn
    // each multi-byte UTF-8 sequence into a single wchar_t. A narrow
    // ifstream would hand back individual bytes instead.
    std::wifstream in("input.txt");
    if (!in) {
        std::wcerr << L"Cannot open input.txt" << std::endl;
        return EXIT_FAILURE;
    }

    // get(c) yields the stream itself, which tests false at end of file or
    // on a decoding error, so no sentinel value is ever printed.
    wchar_t c;
    while (in.get(c))
        std::wcout << c;

    return EXIT_SUCCESS;   // the stream closes itself when it goes out of scope
}
</syntaxhighlight>
 
=={{header|C sharp|C#}}==
<syntaxhighlight lang="csharp">using System;
using System.IO;
using System.Text;
Line 69 ⟶ 124:
}
}
}</syntaxhighlight>
 
=={{header|Common Lisp}}==
{{works with|CLISP}}{{works with|Clozure CL}}{{works with|CMUCL}}{{works with|ECL (Lisp)}}{{works with|SBCL}}{{works with|ABCL}}
 
<syntaxhighlight lang="lisp">;; CLISP puts the external formats into a separate package
#+clisp (import 'charset:utf-8 'keyword)
 
Line 80 ⟶ 135:
(loop for c = (read-char s nil)
while c
do (format t "~a" c)))</syntaxhighlight>
 
=={{header|Crystal}}==
Line 87 ⟶ 142:
The encoding is UTF-8 by default, but it can be explicitly specified.
 
<syntaxhighlight lang="ruby">File.open("input.txt") do |file|
file.each_char { |c| p c }
end</syntaxhighlight>
 
or
 
<syntaxhighlight lang="ruby">File.open("input.txt") do |file|
while c = file.read_char
p c
end
end</syntaxhighlight>
=={{header|Delphi}}==
{{libheader| System.SysUtils}}
{{libheader| System.Classes}}
{{Trans|C#}}
<syntaxhighlight lang="delphi">
program Read_a_file_character_by_character_UTF8;
 
{$APPTYPE CONSOLE}
 
uses
System.SysUtils,
System.Classes;
 
// Reads the next value from StreamReader and returns it converted to a char.
function GetNextCharacter(StreamReader: TStreamReader): char;
begin
  Exit(chr(StreamReader.Read));
end;
 
const
FileName: TFileName = 'input.txt';
 
// Program body: echo FileName to the console one character at a time,
// then wait for Enter so the console window stays open.
begin
  if not FileExists(FileName) then
    raise Exception.Create('Error: File not exist.');

  var F := TStreamReader.Create(FileName, TEncoding.UTF8);
  try
    while not F.EndOfStream do
    begin
      var c := GetNextCharacter(F);
      write(c);
    end;
  finally
    // The reader owns a file handle; free it even if reading raises.
    F.Free;
  end;
  readln;
end.</syntaxhighlight>
 
=={{header|Déjà Vu}}==
 
<syntaxhighlight lang="dejavu">#helper function that deals with non-ASCII code points
local (read-utf8-char) file tmp:
!read-byte file
if = :eof dup:
drop
raise :unicode-error
resize-blob tmp ++ dup len tmp
set-to tmp
try:
return !decode!utf-8 tmp
catch unicode-error:
if < 3 len tmp:
raise :unicode-error
(read-utf8-char) file tmp
 
#reader function
read-utf8-char file:
!read-byte file
if = :eof dup:
return
local :tmp make-blob 1
set-to tmp 0
try:
return !decode!utf-8 tmp
catch unicode-error:
(read-utf8-char) file tmp
 
#if the module is used as a script, read from the file "input.txt",
#showing each code point separately
if = (name) :(main):
local :file !open :read "input.txt"
 
while true:
read-utf8-char file
if = :eof dup:
drop
!close file
return
!.</syntaxhighlight>
 
=={{header|Factor}}==
<syntaxhighlight lang="text">USING: kernel io io.encodings.utf8 io.files strings ;
IN: rosetta-code.read-one
 
"input.txt" utf8 [
[ read1 dup ] [ 1string write ] while drop
] with-file-reader</syntaxhighlight>
 
 
=={{header|FreeBASIC}}==
<syntaxhighlight lang="freebasic">' Print the contents of file.txt, fetching one byte per loop pass.
' The buffer is a fixed-length one-byte string, so each Get reads exactly one
' byte; multi-byte UTF-8 sequences are passed through byte by byte.
Dim As Long f
f = Freefile

Dim As String filename = "file.txt"
Dim As String*1 txt

Open filename For Binary As #f
While Not Eof(f)
    ' No per-iteration re-initialisation is needed: Get overwrites the buffer.
    Get #f, , txt
    Print txt;
Wend
Close #f
Sleep</syntaxhighlight>
 
=={{header|FunL}}==
<syntaxhighlight lang="funl">import io.{InputStreamReader, FileInputStream}
 
r = InputStreamReader( FileInputStream('input.txt'), 'UTF-8' )
Line 156 ⟶ 262:
while (ch = r.read()) != -1
print( chr(ch) )
r.close()</syntaxhighlight>
 
=={{header|Go}}==
<syntaxhighlight lang="go">package main
 
import (
"bufio"
"fmt"
"io"
"os"
)
 
func Runer(r io.RuneReader) func() (rune, error) {
return func() (r rune, err error) {
r, _, err = r.ReadRune()
return
}
}
}
 
// main echoes standard input to standard output one rune at a time.
func main() {
	runes := Runer(bufio.NewReader(os.Stdin))
	// Keep going while reads succeed; the first error (normally io.EOF)
	// ends the loop.
	for r, err := runes(); err == nil; r, err = runes() {
		fmt.Printf("%c", r)
	}
}
</syntaxhighlight>
 
=={{header|Haskell}}==
 
{{Works with|GHC|7.8.3}}
 
<syntaxhighlight lang="haskell">#!/usr/bin/env runhaskell
 
{- The procedure to read a UTF-8 character is just:
 
hGetChar :: Handle -> IO Char
 
assuming that the encoding for the handle has been set to utf8.
-}
 
import System.Environment (getArgs)
import System.IO (
Handle, IOMode (..),
hGetChar, hIsEOF, hSetEncoding, stdin, utf8, withFile
)
import Control.Monad (forM_, unless)
import Text.Printf (printf)
import Data.Char (ord)
 
-- | Print the code point ("U+XXXX", one per line) of every character that
-- remains on the handle, stopping at end of file.
processCharacters :: Handle -> IO ()
processCharacters h = hIsEOF h >>= \atEnd -> unless atEnd step
  where
    step = do
      ch <- hGetChar h
      putStrLn (printf "U+%04X" (ord ch))
      processCharacters h
 
-- | Switch the handle to UTF-8 decoding, then dump all of its characters.
processOneFile :: Handle -> IO ()
processOneFile h = hSetEncoding h utf8 >> processCharacters h
 
{- You can specify one or more files on the command line, or if no
files are specified, it reads from standard input.
-}
main :: IO ()
main = getArgs >>= run
  where
    -- No arguments: read standard input.
    run []    = processOneFile stdin
    -- Otherwise print each file name followed by that file's code points.
    run paths = forM_ paths $ \path -> do
      putStrLn path
      withFile path ReadMode processOneFile</syntaxhighlight>
{{out}}
<pre>
bash$ echo €50 | ./read-char-utf8.hs
U+20AC
U+0035
U+0030
U+000A
</pre>
 
=={{header|J}}==
Line 189 ⟶ 350:
First, we know that the first 8-bit value in a utf-8 sequence tells us the length of the sequence needed to represent that character. Specifically: we can convert that value to binary, and count the number of leading 1s to find the length of the character (except the length is always at least 1 character long).
 
<syntaxhighlight lang="j">u8len=: 1 >. 0 i.~ (8#2)#:a.&i.</syntaxhighlight>
 
So now, we can use indexed file read to read a utf-8 character starting at a specific file index. What we do is read the first octet and then read as many additional characters as we need based on whatever we started with. If that's not possible, we will return EOF:
 
<syntaxhighlight lang="j">indexedread1u8=:4 :0
try.
octet0=. 1!:11 y;x,1
Line 200 ⟶ 361:
'EOF'
end.
)</syntaxhighlight>
 
The length of the result tells us what to add to the file index to find the next available file index for reading.
Line 209 ⟶ 370:
 
=={{header|Java}}==
The ''FileReader'' class offers a ''read'' method which will return the integer value of each character, upon each call.<br />
{{incorrect|Java|Maybe overengineered? See Kotlin}}
When the end of the stream is reached, -1 is returned.<br />
{{trans|NetRexx}}
You can implement this task by enclosing a ''FileReader'' within a class, and generating a new character via a method return.
{{works with|Java|1.7}}
<syntaxhighlight lang ="java">import java.io.*;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
 
public class RUTF8CharacterReaderProgram {
private Stringfinal FileReader slurpedreader;
 
private String encoding;
public Program(String path) throws IOException {
private String fName;
reader = new FileReader(path, StandardCharsets.UTF_16);
private File fFile;
// ---------------------------------------------------------------------------
public String slurpChars(String fileName) {
StringBuilder slrp = new StringBuilder();
fName = fileName;
fFile = new File(fName);
try (Reader fr = new FileReader(fFile)) {
encoding = ((InputStreamReader) fr).getEncoding();
forever: for (;;) {
int ic;
if ((ic = fr.read()) < 0) { break forever; }
char cc = (char) ic;
slrp.append(cc);
}
}
 
catch (FileNotFoundException ex) {
/** @return integer value from 0 to 0xffff, or -1 for EOS */
ex.printStackTrace();
public int nextCharacter() throws IOException {
return reader.read();
}
 
catch (IOException ex) {
public void close() throws IOException {
ex.printStackTrace();
reader.close();
}
slurped = slrp.length() > 0 ? slrp.toString() : null;
return slurped;
}
// ---------------------------------------------------------------------------
public void encodingDetails() {
String FMT_000 = "file_encoding=\"%s\" file_name=\"%s\"%n";
String FMT_001 = "unicode_string_length=\"%d\" code_point_count=\"%d\" string=\"%s\"%n";
String FMT_002 = "codepoint_index=\"%03d\" character_count=\"%d\" unicode_id=\"U+%05X\" hex=\"%#08x\" dec=\"%07d\" oct=\"%07o\" string=\"%s\" utf-16=\"%s\" utf-8=\"%s\" character_name=\"%s\"%n";
String str = slurped;
System.out.printf(FMT_000, encoding, fFile.getAbsoluteFile());
System.out.printf(FMT_001, str.length(), Character.codePointCount(str, 0, str.length()), str);
for (int ix = 0; ix < str.length(); ++ix) {
int cp = Character.codePointAt(str, ix);
int cc = Character.charCount(cp);
String cpName = Character.getName(cp);
String x_utf16;
String x_utf8;
x_utf16 = "";
x_utf8 = "";
try {
x_utf16 = codePointToUTF16(cp);
x_utf8 = codePointToUTF8(cp);
}
catch (UnsupportedEncodingException ex) {
ex.printStackTrace();
}
System.out.printf(FMT_002, ix, cc, cp, cp, ((long) cp & 0x00000000ffffffff), cp, new String(Character.toChars(cp)), x_utf16, x_utf8, cpName);
if (cc > 1) {
int[] surrogates = { (int) Character.highSurrogate(cp), (int) Character.lowSurrogate(cp), };
int ixx = ix++;
for (int sp : surrogates) {
String spName = Character.getName(sp);
x_utf16 = "";
x_utf8 = "";
try {
x_utf16 = codePointToUTF16(sp);
x_utf8 = codePointToUTF8(sp);
}
catch (UnsupportedEncodingException ex) {
ex.printStackTrace();
}
int sc = Character.charCount(sp);
System.out.printf(FMT_002, ixx++, sc, sp, sp, ((long) sp & 0x00000000ffffffff), sp, new String(Character.toChars(sp)), x_utf16, x_utf8, spName);
}
}
}
return;
}
// ---------------------------------------------------------------------------
public static String codePointToUTF8(int cp) throws UnsupportedEncodingException {
String scp = new String(Character.toChars(cp));
boolean comma = false;
StringBuilder xparts = new StringBuilder();
byte[] b_utf8 = scp.getBytes("UTF-8");
for (int xx = 0; xx < b_utf8.length; ++xx) {
if (comma) { xparts.append(','); }
xparts.append(String.format("%02x", b_utf8[xx]));
comma = true;
}
return xparts.toString();
}
// ---------------------------------------------------------------------------
public static String codePointToUTF16(int cp) throws UnsupportedEncodingException {
String scp = new String(Character.toChars(cp));
StringBuilder xparts = new StringBuilder();
byte[] b_utf16 = scp.getBytes("UTF-16BE");
boolean comma = false;
for (int xx = 0; xx < b_utf16.length; xx += 2) {
if (comma) { xparts.append(','); }
xparts.append(String.format("%02x%02x", b_utf16[xx], b_utf16[xx + 1]));
comma = true;
}
return xparts.toString();
}
// ---------------------------------------------------------------------------
public static void main(String[] args) {
String inFile;
if (args.length > 0 && args[0].length() > 0) { inFile = args[0]; }
else { inFile = "./data/utf8-001.txt"; }
RUTF8CharacterReader lcl = new RUTF8CharacterReader();
lcl.slurpChars(inFile);
lcl.encodingDetails();
return;
}
}
</syntaxhighlight>
 
{{out}}
===Using Java 11===
<syntaxhighlight lang="java">
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
 
/**
 * Prints "input.txt" (decoded as UTF-8) to standard output, one char per line.
 */
public final class ReadFileByCharacter {

	/** Value returned by {@code Reader.read()} once the input is exhausted. */
	private static final int END_OF_STREAM = -1;

	public static void main(String[] aArgs) {
		final Path source = Path.of("input.txt");
		try ( BufferedReader in = Files.newBufferedReader(source, StandardCharsets.UTF_8) ) {
			for ( int unit = in.read(); unit != END_OF_STREAM; unit = in.read() ) {
				System.out.println((char) unit);
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
		}
	}

}
</syntaxhighlight>
{{ out }}
<pre>
R
file_encoding="UTF8" file_name="/Users/RosettaCode/java/./data/utf8-001.txt"
o
unicode_string_length="10" code_point_count="8" string="y䮀𝄞𝄢12"
s
codepoint_index="000" character_count="1" unicode_id="U+00079" hex="0x000079" dec="0000121" oct="0000171" string="y" utf-16="0079" utf-8="79" character_name="LATIN SMALL LETTER Y"
e
codepoint_index="001" character_count="1" unicode_id="U+000E4" hex="0x0000e4" dec="0000228" oct="0000344" string="ä" utf-16="00e4" utf-8="c3,a4" character_name="LATIN SMALL LETTER A WITH DIAERESIS"
t
codepoint_index="002" character_count="1" unicode_id="U+000AE" hex="0x0000ae" dec="0000174" oct="0000256" string="®" utf-16="00ae" utf-8="c2,ae" character_name="REGISTERED SIGN"
t
codepoint_index="003" character_count="1" unicode_id="U+020AC" hex="0x0020ac" dec="0008364" oct="0020254" string="€" utf-16="20ac" utf-8="e2,82,ac" character_name="EURO SIGN"
a
codepoint_index="004" character_count="2" unicode_id="U+1D11E" hex="0x01d11e" dec="0119070" oct="0350436" string="𝄞" utf-16="d834,dd1e" utf-8="f0,9d,84,9e" character_name="MUSICAL SYMBOL G CLEF"
codepoint_index="004" character_count="1" unicode_id="U+0D834" hex="0x00d834" dec="0055348" oct="0154064" string="?" utf-16="fffd" utf-8="3f" character_name="HIGH SURROGATES D834"
codepoint_index="005" character_count="1" unicode_id="U+0DD1E" hex="0x00dd1e" dec="0056606" oct="0156436" string="?" utf-16="fffd" utf-8="3f" character_name="LOW SURROGATES DD1E"
codepoint_index="006" character_count="2" unicode_id="U+1D122" hex="0x01d122" dec="0119074" oct="0350442" string="𝄢" utf-16="d834,dd22" utf-8="f0,9d,84,a2" character_name="MUSICAL SYMBOL F CLEF"
codepoint_index="006" character_count="1" unicode_id="U+0D834" hex="0x00d834" dec="0055348" oct="0154064" string="?" utf-16="fffd" utf-8="3f" character_name="HIGH SURROGATES D834"
codepoint_index="007" character_count="1" unicode_id="U+0DD22" hex="0x00dd22" dec="0056610" oct="0156442" string="?" utf-16="fffd" utf-8="3f" character_name="LOW SURROGATES DD22"
codepoint_index="008" character_count="1" unicode_id="U+00031" hex="0x000031" dec="0000049" oct="0000061" string="1" utf-16="0031" utf-8="31" character_name="DIGIT ONE"
codepoint_index="009" character_count="1" unicode_id="U+00032" hex="0x000032" dec="0000050" oct="0000062" string="2" utf-16="0032" utf-8="32" character_name="DIGIT TWO"
</pre>
 
=={{header|jq}}==
jq being stream-oriented, it makes sense to define `readc` so that it emits a stream of the UTF-8 characters in the input:
<syntaxhighlight lang="jq">def readc:
inputs + "\n" | explode[] | [.] | implode;</syntaxhighlight>
 
Example:
<syntaxhighlight lang="sh">
echo '过活' | jq -Rn 'include "readc"; readc'
"过"
"活"
"\n"</syntaxhighlight>
 
=={{header|Julia}}==
Line 359 ⟶ 450:
The built-in <code>read(stream, Char)</code> function reads a single UTF8-encoded character from a given stream.
 
<syntaxhighlight lang="julia">open("myfilename") do f
while !eof(f)
c = read(f, Char)
println(c)
end
end</syntaxhighlight>
 
=={{header|Kotlin}}==
<syntaxhighlight lang="scala">// version 1.1.2
 
import java.io.File
Line 382 ⟶ 473:
}
}
}</syntaxhighlight>
 
=={{header|Lua}}==
{{works with|Lua|5.3}}
<syntaxhighlight lang="lua">
-- Return whether the given string is a single ASCII character.
function is_ascii (str)
Line 444 ⟶ 535:
end
end
</syntaxhighlight>
{{out}}
𝄞 A ö Ж € 𝄞 Ε λ λ η ν ι κ ά y ä ® € 成 长 汉
Line 451 ⟶ 542:
from revision 27, version 9.3, of M2000 Environment, Chinese 长 letter displayed in console (as displayed in editor)
 
<syntaxhighlight lang="m2000 interpreter">
Module checkit {
\\ prepare a file
Line 505 ⟶ 596:
}
checkit
</syntaxhighlight>
 
using document as final$
 
<syntaxhighlight lang="m2000 interpreter">
Module checkit {
\\ prepare a file
Line 569 ⟶ 660:
checkit
 
</syntaxhighlight>
 
=={{header|Mathematica}}/{{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">str = OpenRead["file.txt"];
(* Reads one "Character" item from the stream opened above and converts it to a string with the UTF-8 encoding option. *)
(* NOTE(review): this reads a single item only - wrap in a loop to consume the whole file; confirm that multi-byte sequences come back as one character. *)
ToString[Read[str, "Character"], CharacterEncoding -> "UTF-8"]</syntaxhighlight>
 
=={{header|NetRexx}}==
Line 579 ⟶ 674:
 
:The file <tt>data/utf8-001.txt</tt> is a UTF-8 encoded text file containing the following:&nbsp;&#x79;&#xE4;&#xAE;&#x20AC;&#x1D11E;&#x1D122;&#x31;&#x32;.
<syntaxhighlight lang="netrexx">/* NetRexx */
options replace format comments java crossref symbols nobinary
numeric digits 20
Line 692 ⟶ 787:
say
return
</syntaxhighlight>
{{out}}
<pre>
Line 711 ⟶ 806:
</pre>
 
=={{header|Nim}}==
Like most systems languages, Nim reads bytes and provides functions to decode bytes into Unicode runes. The normal way to read a stream of UTF-8 characters would be to read the file line by line and decode each line using the “utf-8” iterator which yields UTF-8 characters as strings (one by one) or using the “runes” iterator which yields the UTF-8 characters as Runes (one by one).
 
As in fact the file would be read line by line, even if the characters are actually yielded one by one, it may be considered as cheating. So, we provide a function and an iterator which read bytes one by one.
 
<syntaxhighlight lang="nim">import unicode
 
proc readUtf8(f: File): string =
  ## Read bytes from `f` until they form a valid UTF-8 string and return them.
  result.add f.readChar()
  # validateUtf8 returns -1 once the accumulated bytes are valid UTF-8.
  while result.validateUtf8() != -1:
    result.add f.readChar()
 
iterator readUtf8(f: File): string =
  ## Yield each UTF-8 character of file `f` in turn, as a string.
  var buf: string
  while not f.endOfFile:
    buf.setLen(0)
    buf.add f.readChar()
    # Keep appending bytes until the buffer validates as UTF-8.
    while buf.validateUtf8() != -1:
      buf.add f.readChar()
    yield buf</syntaxhighlight>
 
=={{header|Pascal}}==
<syntaxhighlight lang="pascal">(* Read a file char by char *)
program ReadFileByChar;
{ Copies testin.txt to testout.txt, transferring one char per loop pass. }
var
  Source, Target: file of char;
  Ch: char;
begin
  Assign(Source, 'testin.txt');
  Assign(Target, 'testout.txt');
  Reset(Source);
  Rewrite(Target);
  while not Eof(Source) do
  begin
    Read(Source, Ch);
    Write(Target, Ch)
  end;
  Close(Source);
  Close(Target)
end.
</syntaxhighlight>
=={{header|Perl}}==
<syntaxhighlight lang="perl">binmode STDOUT, ':utf8'; # so we can print wide chars without warning
 
open my $fh, "<:encoding(UTF-8)", "input.txt" or die "$!\n";
Line 720 ⟶ 858:
}
 
close $fh;</syntaxhighlight>
 
If the contents of the ''input.txt'' file are <code>aă€⼥</code> then the output would be:
Line 736 ⟶ 874:
precisely one unicode character from a file. If there is a genuine demand for it, I
could easily add this to that file permanently, and document/autoinclude it properly.
<lang Phix>constant INVALID_UTF8 = #FFFD
 
<!--<syntaxhighlight lang="phix">-->
function get_one_utf8_char(integer fn)
<span style="color: #008080;">constant</span> <span style="color: #000000;">INVALID_UTF8</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">#FFFD</span>
-- returns INVALID_UTF8 on error, else a string of 1..4 bytes representing one character
object res
<span style="color: #008080;">function</span> <span style="color: #000000;">get_one_utf8_char</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
integer headb, bytes, c
<span style="color: #000080;font-style:italic;">-- returns INVALID_UTF8 on error, else a string of 1..4 bytes representing one character</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">res</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">c</span>
<span style="color: #000080;font-style:italic;">-- headb = first byte of utf-8 character:</span>
<span style="color: #000000;">headb</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">headb</span>
<span style="color: #000080;font-style:italic;">-- calculate length of utf-8 character in bytes (1..4):</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- (utf-8 starts at #0)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b01111111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span> <span style="color: #000080;font-style:italic;">-- 0b_0xxx_xxxx</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b10111111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- (it's a tail byte)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b11011111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">2</span> <span style="color: #000080;font-style:italic;">-- 0b_110x_xxxx</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b11101111</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">3</span> <span style="color: #000080;font-style:italic;">-- 0b_1110_xxxx</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">headb</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0b11110100</span> <span style="color: #008080;">then</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">4</span> <span style="color: #000080;font-style:italic;">-- 0b_1111_0xzz</span>
<span style="color: #008080;">else</span> <span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- (utf-8 ends at #10FFFF)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- 2..4 bytes encoding (tail range: 0b_1000_0000..0b_1011_1111);</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> <span style="color: #000080;font-style:italic;">-- tail bytes are valid?</span>
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#80</span> <span style="color: #008080;">or</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#BF</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">bytes</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- invalid tail byte or eof</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">c</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #000080;font-style:italic;">-- 1 byte encoding (head range: 0b_0000_0000..0b_0111_1111):</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">headb</span> <span style="color: #000080;font-style:italic;">-- UTF-8 = ASCII
-- 2 bytes encoding (head range: 0b_1100_0000..0b_1101_1111):</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">2</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">#1F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#40</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b110[7..11] headb</span>
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- 0b10[1..6] tail</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#7FF</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #000080;font-style:italic;">-- sanity check</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#80</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- long form?</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- 3 bytes encoding (head range: 0b_1110_0000..0b_1110_1111):</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">3</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">#0F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#1000</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b1110[13..16] head</span>
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#40</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b10[7..12] tail</span>
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- 0b10[1..6] tail</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#FFFF</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #000080;font-style:italic;">-- sanity check</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#800</span> <span style="color: #000080;font-style:italic;">-- long form?</span>
<span style="color: #008080;">or</span> <span style="color: #0000FF;">(</span><span style="color: #000000;">c</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">#D800</span> <span style="color: #008080;">and</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">#DFFF</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- utf-16 incompatible</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- 4 bytes encoding (head range: 0b_1111_0000..0b_1111_0111):</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">bytes</span><span style="color: #0000FF;">=</span><span style="color: #000000;">4</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">c</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">headb</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">#07</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#040000</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b11110[19..21] head</span>
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#1000</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b10[13..18] tail</span>
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)*</span><span style="color: #000000;">#0040</span> <span style="color: #0000FF;">+</span> <span style="color: #000080;font-style:italic;">-- 0b10[7..12] tail</span>
<span style="color: #7060A8;">and_bits</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">4</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">#3F</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- 0b10[1..6] tail</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">c</span><span style="color: #0000FF;"><</span><span style="color: #000000;">#10000</span> <span style="color: #000080;font-style:italic;">-- long form?</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">c</span><span style="color: #0000FF;">></span><span style="color: #000000;">#10FFFF</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span> <span style="color: #000080;font-style:italic;">-- utf-8 ends at #10FFFF</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- bytes = 0; current byte is not encoded correctly:</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">INVALID_UTF8</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
 
Test code:
-- headb = first byte of utf-8 character:
headb = getc(fn)
if headb=-1 then return -1 end if
res = ""&headb
 
-- calculate length of utf-8 character in bytes (1..4):
if headb<0 then bytes = 0 -- (utf-8 starts at #0)
elsif headb<=0b01111111 then bytes = 1 -- 0b_0xxx_xxxx
elsif headb<=0b10111111 then bytes = 0 -- (it's a tail byte)
elsif headb<=0b11011111 then bytes = 2 -- 0b_110x_xxxx
elsif headb<=0b11101111 then bytes = 3 -- 0b_1110_xxxx
elsif headb<=0b11110100 then bytes = 4 -- 0b_1111_0xzz
else bytes = 0 -- (utf-8 ends at #10FFFF)
end if
 
-- 2..4 bytes encoding (tail range: 0b_1000_0000..0b_1011_1111);
for j=1 to bytes-1 do -- tail bytes are valid?
c = getc(fn)
if c<#80 or c>#BF then
bytes = 0 -- invalid tail byte or eof
exit
end if
res &= c
end for
 
-- 1 byte encoding (head range: 0b_0000_0000..0b_0111_1111):
if bytes=1 then
c = headb -- UTF-8 = ASCII
 
-- 2 bytes encoding (head range: 0b_1100_0000..0b_1101_1111):
elsif bytes=2 then
c = and_bits(headb, #1F)*#40 + -- 0b110[7..11] headb
and_bits(res[2], #3F) -- 0b10[1..6] tail
if c>#7FF then ?9/0 end if -- sanity check
if c<#80 then -- long form?
res = INVALID_UTF8
end if
 
-- 3 bytes encoding (head range: 0b_1110_0000..0b_1110_1111):
elsif bytes=3 then
c = and_bits(headb, #0F)*#1000 + -- 0b1110[13..16] head
and_bits(res[2], #3F)*#40 + -- 0b10[7..12] tail
and_bits(res[3], #3F) -- 0b10[1..6] tail
if c>#FFFF then ?9/0 end if -- sanity check
if c<#800 -- long form?
or (c>=#D800 and c<=#DFFF) then -- utf-16 incompatible
res = INVALID_UTF8
end if
 
<!--<syntaxhighlight lang="phix">-->
-- 4 bytes encoding (head range: 0b_1111_0000..0b_1111_0111):
<span style="color: #000080;font-style:italic;">--string utf8 = "aă€⼥" -- (same results as next)</span>
elsif bytes=4 then
<span style="color: #004080;">string</span> <span style="color: #000000;">utf8</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">utf32_to_utf8</span><span style="color: #0000FF;">({</span><span style="color: #000000;">#0061</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0103</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#20ac</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#2f25</span><span style="color: #0000FF;">})</span>
c = and_bits(headb, #07)*#040000 + -- 0b11110[19..21] head
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"length of utf8 is %d bytes\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">utf8</span><span style="color: #0000FF;">))</span>
and_bits(res[2], #3F)*#1000 + -- 0b10[13..18] tail
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"test.txt"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"wb"</span><span style="color: #0000FF;">)</span>
and_bits(res[3], #3F)*#0040 + -- 0b10[7..12] tail
<span style="color: #7060A8;">puts</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">,</span><span style="color: #000000;">utf8</span><span style="color: #0000FF;">)</span>
and_bits(res[4], #3F) -- 0b10[1..6] tail
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
if c<#10000 -- long form?
<span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"test.txt"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"r"</span><span style="color: #0000FF;">)</span>
or c>#10FFFF then
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">5</span> <span style="color: #008080;">do</span>
res = INVALID_UTF8 -- utf-8 ends at #10FFFF
<span style="color: #004080;">object</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_one_utf8_char</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #008080;">if</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #000000;">LINUX</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d (%s) is %d bytes\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span>
<span style="color: #008080;">else</span>
<span style="color: #000080;font-style:italic;">-- unicode and consoles tricky on windows, so I'm
-- just avoiding that issue altogther (t)here.</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d is %d bytes\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d - EOF\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">i</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">else</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"char %d - INVALID_UTF8\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">i</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
 
-- bytes = 0; current byte is not encoded correctly:
else
res = INVALID_UTF8
end if
 
return res
end function</lang>
Test code:
<lang Phix>--string utf8 = "aă€⼥" -- (same results as next)
string utf8 = utf32_to_utf8({#0061,#0103,#20ac,#2f25})
printf(1,"length of utf8 is %d bytes\n",length(utf8))
integer fn = open("test.txt","wb")
puts(fn,utf8)
close(fn)
fn = open("test.txt","r")
for i=1 to 5 do
object res = get_one_utf8_char(fn)
if string(res) then
if platform()=LINUX then
printf(1,"char %d (%s) is %d bytes\n",{i,res,length(res)})
else
-- unicode and consoles tricky on windows, so I'm
-- just avoiding that issue altogther (t)here.
printf(1,"char %d is %d bytes\n",{i,length(res)})
end if
elsif res=-1 then
printf(1,"char %d - EOF\n",i)
exit
else
printf(1,"char %d - INVALID_UTF8\n",i)
exit
end if
end for
close(fn)</lang>
{{out}}
<pre>
Line 849 ⟶ 995:
=={{header|PicoLisp}}==
Pico Lisp uses UTF-8 until told otherwise.
<syntaxhighlight lang="picolisp">
<lang PicoLisp>
# Read the file "wordlist" one character at a time. 'char' returns NIL at
# end of file, which terminates the loop. 'process' stands for whatever is
# to be done with each character, which 'while' leaves in '@'.
(in "wordlist"
   (while (char)
      (process @) ) )
</syntaxhighlight>
</lang>
 
=={{header|Python}}==
{{works with|Python|2.7}}
<langsyntaxhighlight lang="python">
def get_next_character(f):
# note: assumes valid utf-8
Line 878 ⟶ 1,024:
for c in get_next_character(f):
print(c)
</syntaxhighlight>
</lang>
 
{{works with|Python|3}}
Python 3 simplifies the handling of text files since you can specify an encoding.
<langsyntaxhighlight lang="python">def get_next_character(f):
"""Reads one character from the given textfile"""
c = f.read(1)
Line 892 ⟶ 1,038:
with open("input.txt", encoding="utf-8") as f:
for c in get_next_character(f):
print(c, sep="", end="")</langsyntaxhighlight>
 
=={{header|QBasic}}==
<syntaxhighlight lang="qbasic">f = FREEFILE
filename$ = "file.txt"
 
OPEN filename$ FOR BINARY AS #f
' GET reads exactly LEN(char$) bytes, so a one-character buffer makes
' every GET deliver a single character (byte).
char$ = " "
' Loop on the byte position rather than on EOF(): for a BINARY file EOF
' only becomes true after a GET has already failed to fill its buffer,
' which would print one spurious character at the end.
WHILE LOC(f) < LOF(f)
    GET #f, , char$
    PRINT char$;
WEND
CLOSE #f</syntaxhighlight>
 
=={{header|Racket}}==
Don't we all love self reference?
<langsyntaxhighlight lang="racket">
#lang racket
; This file contains utf-8 charachters: λ, α, γ ...
(for ([c (in-port read-char (open-input-file "read-file.rkt"))])
(display c))
</syntaxhighlight>
</lang>
Output:
<langsyntaxhighlight lang="racket">
#lang racket
; This file contains utf-8 charachters: λ, α, γ ...
(for ([c (in-port read-char (open-input-file "read-file.rkt"))])
(display c))
</syntaxhighlight>
</lang>
 
=={{header|Raku}}==
Line 916 ⟶ 1,074:
 
To read a single character at a time from the Standard Input terminal; $*IN in Raku:
<syntaxhighlight lang="raku" line>.say while defined $_ = $*IN.getc;</syntaxhighlight>
 
Or, from a file:
<syntaxhighlight lang="raku" perl6line>my $filename = 'whatever';

# Open the file for reading. If open does not return a defined handle,
# `orelse` runs its right-hand side with $_ set to that result, and .die throws.
my $in = open( $filename, :r ) orelse .die;

# Print one character per getc call; the loop stops at the first undefined
# result (presumably end of file - confirm against the IO::Handle.getc docs).
print $_ while defined $_ = $in.getc;</langsyntaxhighlight>
 
=={{header|REXX}}==
Line 930 ⟶ 1,088:
<br>The task's requirement stated that '''EOF''' was to be returned upon reaching the end-of-file, so this programming example was written as a subroutine (procedure).
<br>Note that displaying of characters that may modify screen behavior such as tab usage, backspaces, line feeds, carriage returns, "bells" and others are suppressed, but their hexadecimal equivalents are displayed.
<langsyntaxhighlight lang="rexx">/*REXX program reads and displays a file char by char, returning 'EOF' when done. */
parse arg iFID . /*iFID: is the fileID to be read. */
/* [↓] show the file's contents. */
Line 940 ⟶ 1,098:
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
getchar: procedure; parse arg z; if chars(z)==0 then return 'EOF'; return charin(z)</langsyntaxhighlight>
'''input''' &nbsp; file: &nbsp; '''ABC'''
<br>and was created by the DOS command (under Windows/XP): &nbsp; &nbsp; '''echo 123 [¬ a prime]> ABC'''
Line 970 ⟶ 1,128:
 
===version 2===
<langsyntaxhighlight lang="rexx">/* REXX ---------------------------------------------------------------
* 29.12.2013 Walter Pachl
* read one utf8 character at a time
Line 1,009 ⟶ 1,167:
Return c
 
c2b: Return x2b(c2x(arg(1)))</langsyntaxhighlight>
output:
<pre>y 79
Line 1,019 ⟶ 1,177:
 
=={{header|Ring}}==
<langsyntaxhighlight lang="ring">
fp = fopen("C:\Ring\ReadMe.txt","r")
r = fgetc(fp)
Line 1,027 ⟶ 1,185:
end
fclose(fp)
</syntaxhighlight>
</lang>
Output:
<pre>
Line 1,055 ⟶ 1,213:
{{works with|Ruby|1.9}}
 
<langsyntaxhighlight lang="ruby">File.open('input.txt', 'r:utf-8') do |f|
# The mode string 'r:utf-8' opens the file read-only with UTF-8 as its
# external encoding; each character is then printed in its inspect form.
f.each_char{|c| p c}
end</langsyntaxhighlight>
 
or
 
<langsyntaxhighlight lang="ruby">File.open('input.txt', 'r:utf-8') do |f|
# Explicit loop: getc hands back one character per call, and the loop
# ends as soon as it returns a falsy value (nil at end of file).
while c = f.getc
p c
end
end</langsyntaxhighlight>
 
=={{header|Run BASIC}}==
<langsyntaxhighlight lang="runbasic">open "file.txt" for binary as #f
numChars = 1 ' specify number of characters to read
a$ = input$(#f,numChars) ' read number of characters specified
b$ = input$(#f,1) ' read one character
close #f</langsyntaxhighlight>
 
=={{header|Rust}}==
The Rust standard library provides hardly any straightforward way to read single UTF-8 characters
from a file. The following code implements an iterator that consumes a byte stream, taking only as
many bytes as necessary to decode the next UTF-8 character. It reports errors in some detail,
so that client code can use them to deal with corrupted input.
 
The decoding code is based on [https://docs.rs/crate/utf8-decode/1.0.0/source/ utf8-decode] crate
originally.
 
<syntaxhighlight lang="rust">use std::{
convert::TryFrom,
fmt::{Debug, Display, Formatter},
io::Read,
};
 
/// Iterator adapter that decodes a stream of bytes into UTF-8 characters.
///
/// The byte source is kept in a `Peekable` wrapper so that the upcoming byte
/// can be inspected before it is consumed.
pub struct ReadUtf8<I: Iterator> {
source: std::iter::Peekable<I>,
}
 
impl<R: Read> From<R> for ReadUtf8<std::io::Bytes<R>> {
fn from(source: R) -> Self {
ReadUtf8 {
source: source.bytes().peekable(),
}
}
}
 
impl<I, E> Iterator for ReadUtf8<I>
where
    I: Iterator<Item = Result<u8, E>>,
{
    type Item = Result<char, Error<E>>;

    /// Decode the next character, or return `None` once the byte source is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // `?` ends the iteration when there is no lead byte left.
        let decoded = match self.source.next()? {
            Ok(lead) => self.complete_char(lead),
            Err(e) => Err(Error::SourceError(e)),
        };
        Some(decoded)
    }
}
 
impl<I, E> ReadUtf8<I>
where
I: Iterator<Item = Result<u8, E>>,
{
fn continuation(&mut self) -> Result<u32, Error<E>> {
if let Some(Ok(byte)) = self.source.peek() {
let byte = *byte;
 
return if byte & 0b1100_0000 == 0b1000_0000 {
self.source.next();
Ok((byte & 0b0011_1111) as u32)
} else {
Err(Error::InvalidByte(byte))
};
}
 
match self.source.next() {
None => Err(Error::InputTruncated),
Some(Err(e)) => Err(Error::SourceError(e)),
Some(Ok(_)) => unreachable!(),
}
}
 
fn complete_char(&mut self, lead: u8) -> Result<char, Error<E>> {
let a = lead as u32; // Let's name the bytes in the sequence
 
let result = if a & 0b1000_0000 == 0 {
Ok(a)
} else if lead & 0b1110_0000 == 0b1100_0000 {
let b = self.continuation()?;
Ok((a & 0b0001_1111) << 6 | b)
} else if a & 0b1111_0000 == 0b1110_0000 {
let b = self.continuation()?;
let c = self.continuation()?;
Ok((a & 0b0000_1111) << 12 | b << 6 | c)
} else if a & 0b1111_1000 == 0b1111_0000 {
let b = self.continuation()?;
let c = self.continuation()?;
let d = self.continuation()?;
Ok((a & 0b0000_0111) << 18 | b << 12 | c << 6 | d)
} else {
Err(Error::InvalidByte(lead))
};
 
Ok(char::try_from(result?).unwrap())
}
}
 
/// Errors that can be reported while decoding the byte stream.
#[derive(Debug, Clone)]
pub enum Error<E> {
/// A byte that is not valid at its position in a UTF-8 sequence.
InvalidByte(u8),
/// The input ended where a continuation byte was still expected.
InputTruncated,
/// An error produced by the underlying byte source, passed through unchanged.
SourceError(E),
}
 
impl<E: Display> Display for Error<E> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::InvalidByte(b) => write!(f, "invalid byte 0x{:x}", b),
Self::InputTruncated => write!(f, "character truncated"),
Self::SourceError(e) => e.fmt(f),
}
}
}
 
fn main() -> std::io::Result<()> {
for (index, value) in ReadUtf8::from(std::fs::File::open("test.txt")?).enumerate() {
match value {
Ok(c) => print!("{}", c),
 
Err(e) => {
print!("\u{fffd}");
eprintln!("offset {}: {}", index, e);
}
}
}
 
Ok(())
}</syntaxhighlight>
 
 
=={{header|Seed7}}==
Line 1,082 ⟶ 1,362:
the file [http://seed7.sourceforge.net/libraries/utf8.htm#STD_UTF8_OUT STD_UTF8_OUT] is used.
 
<langsyntaxhighlight lang="seed7">$ include "seed7_05.s7i";
include "utf8.s7i";
 
Line 1,099 ⟶ 1,379:
close(inFile);
end if;
end func;</langsyntaxhighlight>
 
{{out}}
Line 1,111 ⟶ 1,391:
 
=={{header|Sidef}}==
<syntaxhighlight lang="ruby">var file = File('input.txt') # the input file contains: "aă€⼥"
var fh = file.open_r # equivalent with: file.open('<:utf8')
fh.each_char { |char|
printf("got character #{char} [U+%04x]\n", char.ord)
}</syntaxhighlight>
{{out}}
<pre>
Line 1,123 ⟶ 1,403:
got character ⼥ [U+2f25]
</pre>
 
=={{header|Smalltalk}}==
{{works with|Smalltalk/X}}
<syntaxhighlight lang="smalltalk">|utfStream|
"Open the file named 'input' for reading and wrap it with asUTF8EncodedStream.
 Presumably this makes #next answer whole decoded characters rather than
 single bytes - confirm against the Smalltalk/X documentation."
utfStream := 'input' asFilename readStream asUTF8EncodedStream.
"Show one Transcript line per element until the stream reports atEnd."
[utfStream atEnd] whileFalse:[
Transcript showCR:'got char ',utfStream next.
].
"Release the stream once everything has been read."
utfStream close.</syntaxhighlight>
 
=={{header|Tcl}}==
To read a single character from a file, use:
<syntaxhighlight lang="tcl">set ch [read $channel 1]</syntaxhighlight>
This will read multiple bytes sufficient to obtain a Unicode character if a suitable encoding has been configured on the channel. For binary channels, this will always consume exactly one byte. However, the low-level channel buffering logic may consume more than one byte (which only really matters where the channel is being handed on to another process and the channel is over a file descriptor that doesn't support the <tt>lseek</tt> OS call); the extent of buffering can be controlled via:
<syntaxhighlight lang="tcl">fconfigure $channel -buffersize $byteCount</syntaxhighlight>
When the channel is only being accessed from Tcl (or via Tcl's C API) it is not normally necessary to adjust this option.
 
=={{header|V (Vlang)}}==
<syntaxhighlight lang="v (vlang)">
import os
 
// main prints the whole of './file.txt' to stdout in one go.
// It exits with status 1 (after printing a message) when the path is not a
// regular file or when the file cannot be read.
fn main() {
    path := './file.txt'

    // Guard clause: bail out early when there is no such file.
    if !os.is_file(path) {
        println('Error: can not find file')
        exit(1)
    }

    raw := os.read_bytes(path) or {
        println('Error: can not read')
        exit(1)
    }

    // Reinterpret the raw bytes as a string and print it.
    println(raw.bytestr())
}
</syntaxhighlight>
 
=={{header|Wren}}==
<syntaxhighlight lang="wren">import "io" for File

File.open("input.txt") { |file|
Line 1,147 ⟶ 1,459:
offset = offset + 1
}
}</syntaxhighlight>
 
=={{header|zkl}}==
zkl doesn't know much about UTF-8 or Unicode but is able to test whether a string or number is valid UTF-8 or not. This code uses that to build a state machine to decode a byte stream into UTF-8 characters.
<syntaxhighlight lang="zkl">fcn readUTF8c(chr,s=""){ // transform UTF-8 character stream
s+=chr;
try{ s.len(8); return(s) }
catch{ if(s.len()>6) throw(__exception) } // 6 bytes max for UTF-8
return(Void.Again,s); // call me again with s & another character
}</syntaxhighlight>
Used to modify a zkl iterator, it can consume any stream-able (files, strings, lists, etc) and provides support for foreach, map, look ahead, push back, etc.
<syntaxhighlight lang="zkl">fcn utf8Walker(obj){
obj.walker(3) // read characters
.tweak(readUTF8c)
}</syntaxhighlight>
<syntaxhighlight lang="zkl">s:="-->\u20AC123"; // --> e2,82,ac,31,32,33 == -->€123
utf8Walker(s).walk().println();

Line 1,168 ⟶ 1,480:
foreach c in (utf8Walker(Data(Void,s,"\n"))){ print(c) }

utf8Walker(Data(Void,0xe2,0x82,"123456")).walk().println(); // € is short 1 byte</syntaxhighlight>
{{out}}
<pre>
Line 1,177 ⟶ 1,489:
</pre>
If you wish to push a UTF-8 stream through one or more functions, you can use the same state machine:
<syntaxhighlight lang="zkl">stream:=Data(Void,s,"\n").howza(3); // character stream
stream.pump(List,readUTF8c,"print")</syntaxhighlight>
{{out}}<pre>-->€123</pre>
and returns a list of the eight UTF-8 characters (with newline).
Or, if file "foo.txt" contains the characters:
<syntaxhighlight lang="zkl">File("foo.txt","rb").howza(3).pump(List,readUTF8c,"print");</syntaxhighlight>
produces the same result.
 
9,482

edits