Read a file character by character/UTF8: Difference between revisions

Content added Content deleted
m (→‎{{header|Haskell}}: (specified imports))
(Add source for Rust)
Line 16: Line 16:
<lang AutoHotkey>File := FileOpen("input.txt", "r")
<lang AutoHotkey>File := FileOpen("input.txt", "r")
while !File.AtEOF
while !File.AtEOF
MsgBox, % File.Read(1)</lang>
MsgBox, % File.Read(1)</lang>


=={{header|C}}==
=={{header|C}}==
Line 103: Line 103:
<lang dejavu>#helper function that deals with non-ASCII code points
<lang dejavu>#helper function that deals with non-ASCII code points
local (read-utf8-char) file tmp:
local (read-utf8-char) file tmp:
!read-byte file
!read-byte file
if = :eof dup:
if = :eof dup:
drop
drop
raise :unicode-error
raise :unicode-error
resize-blob tmp ++ dup len tmp
resize-blob tmp ++ dup len tmp
set-to tmp
set-to tmp
try:
try:
return !decode!utf-8 tmp
return !decode!utf-8 tmp
catch unicode-error:
catch unicode-error:
if < 3 len tmp:
if < 3 len tmp:
raise :unicode-error
raise :unicode-error
(read-utf8-char) file tmp
(read-utf8-char) file tmp


#reader function
#reader function
read-utf8-char file:
read-utf8-char file:
!read-byte file
!read-byte file
if = :eof dup:
if = :eof dup:
return
return
local :tmp make-blob 1
local :tmp make-blob 1
set-to tmp 0
set-to tmp 0
try:
try:
return !decode!utf-8 tmp
return !decode!utf-8 tmp
catch unicode-error:
catch unicode-error:
(read-utf8-char) file tmp
(read-utf8-char) file tmp


#if the module is used as a script, read from the file "input.txt",
#if the module is used as a script, read from the file "input.txt",
#showing each code point separately
#showing each code point separately
if = (name) :(main):
if = (name) :(main):
local :file !open :read "input.txt"
local :file !open :read "input.txt"


while true:
while true:
read-utf8-char file
read-utf8-char file
if = :eof dup:
if = :eof dup:
drop
drop
!close file
!close file
return
return
!.</lang>
!.</lang>


=={{header|Factor}}==
=={{header|Factor}}==
Line 156: Line 156:
while (ch = r.read()) != -1
while (ch = r.read()) != -1
print( chr(ch) )
print( chr(ch) )
r.close()</lang>
r.close()</lang>


Line 163: Line 163:


import (
import (
"bufio"
"bufio"
"fmt"
"fmt"
"io"
"io"
"os"
"os"
)
)


func Runer(r io.RuneReader) func() (rune, error) {
func Runer(r io.RuneReader) func() (rune, error) {
return func() (r rune, err error) {
return func() (r rune, err error) {
r, _, err = r.ReadRune()
r, _, err = r.ReadRune()
return
return
}
}
}
}


func main() {
func main() {
runes := Runer(bufio.NewReader(os.Stdin))
runes := Runer(bufio.NewReader(os.Stdin))
for r, err := runes(); err != nil; r,err = runes() {
for r, err := runes(); err != nil; r,err = runes() {
fmt.Printf("%c", r)
fmt.Printf("%c", r)
}
}
}</lang>
}</lang>


Line 1,128: Line 1,128:
b$ = input$(#f,1) ' read one character
b$ = input$(#f,1) ' read one character
close #f</lang>
close #f</lang>

=={{header|Rust}}==
The Rust standard library provides hardly any straightforward way to read single UTF-8 characters
from a file. The following code implements an iterator that consumes a byte stream, taking only as
many bytes as necessary to decode the next UTF-8 character. It reports errors in enough detail
that client code can use them to deal with corrupted input.

The decoding code was originally based on the
[https://docs.rs/crate/utf8-decode/1.0.0/source/ utf8-decode] crate.

<lang Rust>use std::{
convert::TryFrom,
fmt::{Debug, Display, Formatter},
io::Read,
};

pub struct ReadUtf8<I: Iterator> {
source: std::iter::Peekable<I>,
}

impl<R: Read> From<R> for ReadUtf8<std::io::Bytes<R>> {
fn from(source: R) -> Self {
ReadUtf8 {
source: source.bytes().peekable(),
}
}
}

impl<I, E> Iterator for ReadUtf8<I>
where
I: Iterator<Item = Result<u8, E>>,
{
type Item = Result<char, Error<E>>;

fn next(&mut self) -> Option<Self::Item> {
self.source.next().map(|next| match next {
Ok(lead) => self.complete_char(lead),
Err(e) => Err(Error::SourceError(e)),
})
}
}

impl<I, E> ReadUtf8<I>
where
I: Iterator<Item = Result<u8, E>>,
{
fn continuation(&mut self) -> Result<u32, Error<E>> {
if let Some(Ok(byte)) = self.source.peek() {
let byte = *byte;

return if byte & 0b1100_0000 == 0b1000_0000 {
self.source.next();
Ok((byte & 0b0011_1111) as u32)
} else {
Err(Error::InvalidByte(byte))
};
}

match self.source.next() {
None => Err(Error::InputTruncated),
Some(Err(e)) => Err(Error::SourceError(e)),
Some(Ok(_)) => unreachable!(),
}
}

fn complete_char(&mut self, lead: u8) -> Result<char, Error<E>> {
let a = lead as u32; // Let's name the bytes in the sequence

let result = if a & 0b1000_0000 == 0 {
Ok(a)
} else if lead & 0b1110_0000 == 0b1100_0000 {
let b = self.continuation()?;
Ok((a & 0b0001_1111) << 6 | b)
} else if a & 0b1111_0000 == 0b1110_0000 {
let b = self.continuation()?;
let c = self.continuation()?;
Ok((a & 0b0000_1111) << 12 | b << 6 | c)
} else if a & 0b1111_1000 == 0b1111_0000 {
let b = self.continuation()?;
let c = self.continuation()?;
let d = self.continuation()?;
Ok((a & 0b0000_0111) << 18 | b << 12 | c << 6 | d)
} else {
Err(Error::InvalidByte(lead))
};

Ok(char::try_from(result?).unwrap())
}
}

/// Failure reported while decoding a byte stream as UTF-8.
///
/// `E` is the error type of the underlying byte source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error<E> {
    /// A byte that is not acceptable at the position where it was found.
    InvalidByte(u8),
    /// The input ended in the middle of a multi-byte sequence.
    InputTruncated,
    /// An error produced by the underlying byte source.
    SourceError(E),
}

impl<E: Display> Display for Error<E> {
    /// Writes a short human-readable message; a source error is displayed
    /// exactly as the wrapped error displays itself.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidByte(b) => write!(f, "invalid byte 0x{:x}", b),
            Self::InputTruncated => write!(f, "character truncated"),
            Self::SourceError(e) => e.fmt(f),
        }
    }
}

// Lets the type be used wherever a standard error is expected (boxed errors,
// `?` into `Box<dyn Error>`, error-reporting helpers). `source()` is left at
// its default because `Display` already prints the wrapped error.
impl<E: Debug + Display> std::error::Error for Error<E> {}

/// Prints `test.txt` to standard output one character at a time.
///
/// Every decoding or read error is replaced by U+FFFD on standard output and
/// described on standard error. Fails only if the file cannot be opened.
fn main() -> std::io::Result<()> {
    // `Read::bytes` issues one `read` call per byte, so the file is wrapped in
    // a `BufReader` to avoid hitting the operating system for every byte.
    let input = std::io::BufReader::new(std::fs::File::open("test.txt")?);

    // `index` counts decoded items (characters and errors), not bytes.
    for (index, value) in ReadUtf8::from(input).enumerate() {
        match value {
            Ok(c) => print!("{}", c),

            Err(e) => {
                print!("\u{fffd}");
                eprintln!("offset {}: {}", index, e);
            }
        }
    }

    Ok(())
}</lang>



=={{header|Seed7}}==
=={{header|Seed7}}==