Read a file character by character/UTF8: Difference between revisions

Add source for Rust
m (→‎{{header|Haskell}}: (specified imports))
(Add source for Rust)
Line 16:
<lang AutoHotkey>File := FileOpen("input.txt", "r")
while !File.AtEOF
MsgBox, % File.Read(1)</lang>
 
=={{header|C}}==
Line 103:
<lang dejavu>#helper function that deals with non-ASCII code points
local (read-utf8-char) file tmp:
!read-byte file
if = :eof dup:
drop
raise :unicode-error
resize-blob tmp ++ dup len tmp
set-to tmp
try:
return !decode!utf-8 tmp
catch unicode-error:
if < 3 len tmp:
raise :unicode-error
(read-utf8-char) file tmp
 
#reader function
read-utf8-char file:
!read-byte file
if = :eof dup:
return
local :tmp make-blob 1
set-to tmp 0
try:
return !decode!utf-8 tmp
catch unicode-error:
(read-utf8-char) file tmp
 
#if the module is used as a script, read from the file "input.txt",
#showing each code point separately
if = (name) :(main):
local :file !open :read "input.txt"
 
while true:
read-utf8-char file
if = :eof dup:
drop
!close file
return
!.</lang>
 
=={{header|Factor}}==
Line 156:
while (ch = r.read()) != -1
print( chr(ch) )
r.close()</lang>
 
Line 163:
 
import (
"bufio"
"fmt"
"io"
"os"
)
 
func Runer(r io.RuneReader) func() (rune, error) {
return func() (r rune, err error) {
r, _, err = r.ReadRune()
return
}
}
}
 
// main echoes standard input to standard output one rune at a time.
func main() {
	runes := Runer(bufio.NewReader(os.Stdin))
	// Keep going while reads succeed; the first error (io.EOF at end of
	// input) terminates the loop.
	for r, err := runes(); err == nil; r, err = runes() {
		fmt.Printf("%c", r)
	}
}
}</lang>
 
Line 1,128:
b$ = input$(#f,1) ' read one character
close #f</lang>
 
=={{header|Rust}}==
The Rust standard library offers no straightforward way to read a file one UTF-8 character at a
time. The following code implements an iterator that consumes a byte stream, taking only as
many bytes as necessary to decode the next UTF-8 character. It reports errors in enough detail
that client code can use them to deal with corrupted input.
 
The decoding logic was originally based on the [https://docs.rs/crate/utf8-decode/1.0.0/source/ utf8-decode]
crate.
 
<lang Rust>use std::{
convert::TryFrom,
fmt::{Debug, Display, Formatter},
io::Read,
};
 
/// Iterator adaptor that turns a stream of bytes into decoded UTF-8 characters.
///
/// The byte iterator is kept inside a [`std::iter::Peekable`] so the upcoming
/// byte can be inspected before deciding whether to consume it.
pub struct ReadUtf8<I>
where
    I: Iterator,
{
    source: std::iter::Peekable<I>,
}
 
impl<R: Read> From<R> for ReadUtf8<std::io::Bytes<R>> {
fn from(source: R) -> Self {
ReadUtf8 {
source: source.bytes().peekable(),
}
}
}
 
/// Yields one decoded character (or one decoding error) per call.
impl<I, E> Iterator for ReadUtf8<I>
where
    I: Iterator<Item = Result<u8, E>>,
{
    type Item = Result<char, Error<E>>;

    fn next(&mut self) -> Option<Self::Item> {
        // An exhausted source ends the iteration; anything else produces an item.
        let decoded = match self.source.next()? {
            Ok(lead) => self.complete_char(lead),
            Err(e) => Err(Error::SourceError(e)),
        };

        Some(decoded)
    }
}
 
impl<I, E> ReadUtf8<I>
where
I: Iterator<Item = Result<u8, E>>,
{
fn continuation(&mut self) -> Result<u32, Error<E>> {
if let Some(Ok(byte)) = self.source.peek() {
let byte = *byte;
 
return if byte & 0b1100_0000 == 0b1000_0000 {
self.source.next();
Ok((byte & 0b0011_1111) as u32)
} else {
Err(Error::InvalidByte(byte))
};
}
 
match self.source.next() {
None => Err(Error::InputTruncated),
Some(Err(e)) => Err(Error::SourceError(e)),
Some(Ok(_)) => unreachable!(),
}
}
 
fn complete_char(&mut self, lead: u8) -> Result<char, Error<E>> {
let a = lead as u32; // Let's name the bytes in the sequence
 
let result = if a & 0b1000_0000 == 0 {
Ok(a)
} else if lead & 0b1110_0000 == 0b1100_0000 {
let b = self.continuation()?;
Ok((a & 0b0001_1111) << 6 | b)
} else if a & 0b1111_0000 == 0b1110_0000 {
let b = self.continuation()?;
let c = self.continuation()?;
Ok((a & 0b0000_1111) << 12 | b << 6 | c)
} else if a & 0b1111_1000 == 0b1111_0000 {
let b = self.continuation()?;
let c = self.continuation()?;
let d = self.continuation()?;
Ok((a & 0b0000_0111) << 18 | b << 12 | c << 6 | d)
} else {
Err(Error::InvalidByte(lead))
};
 
Ok(char::try_from(result?).unwrap())
}
}
 
/// Reasons why decoding the next character can fail.
#[derive(Debug, Clone)]
pub enum Error<E> {
    /// A byte that is not valid at the position where it was encountered.
    InvalidByte(u8),
    /// The byte source ended while more bytes of a character were expected.
    InputTruncated,
    /// The underlying byte source reported an error of its own.
    SourceError(E),
}
 
impl<E: Display> Display for Error<E> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::InvalidByte(b) => write!(f, "invalid byte 0x{:x}", b),
Self::InputTruncated => write!(f, "character truncated"),
Self::SourceError(e) => e.fmt(f),
}
}
}
 
fn main() -> std::io::Result<()> {
for (index, value) in ReadUtf8::from(std::fs::File::open("test.txt")?).enumerate() {
match value {
Ok(c) => print!("{}", c),
 
Err(e) => {
print!("\u{fffd}");
eprintln!("offset {}: {}", index, e);
}
}
}
 
Ok(())
}</lang>
 
 
=={{header|Seed7}}==
Anonymous user