Read a file character by character/UTF8: Difference between revisions
Content deleted Content added
m →{{header|Haskell}}: (specified imports) |
Add source for Rust |
||
Line 16: | Line 16: | ||
<lang AutoHotkey>File := FileOpen("input.txt", "r") |
<lang AutoHotkey>File := FileOpen("input.txt", "r") |
||
while !File.AtEOF |
while !File.AtEOF |
||
MsgBox, % File.Read(1)</lang> |
|||
=={{header|C}}== |
=={{header|C}}== |
||
Line 103: | Line 103: | ||
<lang dejavu>#helper function that deals with non-ASCII code points |
<lang dejavu>#helper function that deals with non-ASCII code points |
||
local (read-utf8-char) file tmp: |
local (read-utf8-char) file tmp: |
||
!read-byte file |
|||
if = :eof dup: |
|||
drop |
|||
raise :unicode-error |
|||
resize-blob tmp ++ dup len tmp |
|||
set-to tmp |
|||
try: |
|||
return !decode!utf-8 tmp |
|||
catch unicode-error: |
|||
if < 3 len tmp: |
|||
raise :unicode-error |
|||
(read-utf8-char) file tmp |
|||
#reader function |
#reader function |
||
read-utf8-char file: |
read-utf8-char file: |
||
!read-byte file |
|||
if = :eof dup: |
|||
return |
|||
local :tmp make-blob 1 |
|||
set-to tmp 0 |
|||
try: |
|||
return !decode!utf-8 tmp |
|||
catch unicode-error: |
|||
(read-utf8-char) file tmp |
|||
#if the module is used as a script, read from the file "input.txt", |
#if the module is used as a script, read from the file "input.txt", |
||
#showing each code point separately |
#showing each code point separately |
||
if = (name) :(main): |
if = (name) :(main): |
||
local :file !open :read "input.txt" |
|||
while true: |
|||
read-utf8-char file |
|||
if = :eof dup: |
|||
drop |
|||
!close file |
|||
return |
|||
!.</lang> |
|||
=={{header|Factor}}== |
=={{header|Factor}}== |
||
Line 156: | Line 156: | ||
while (ch = r.read()) != -1 |
while (ch = r.read()) != -1 |
||
print( chr(ch) ) |
print( chr(ch) ) |
||
r.close()</lang> |
r.close()</lang> |
||
Line 163: | Line 163: | ||
import ( |
import ( |
||
"bufio" |
|||
"fmt" |
|||
"io" |
|||
"os" |
|||
) |
) |
||
func Runer(r io.RuneReader) func() (rune, error) { |
func Runer(r io.RuneReader) func() (rune, error) { |
||
return func() (r rune, err error) { |
|||
r, _, err = r.ReadRune() |
|||
return |
|||
} |
|||
} |
|||
} |
} |
||
func main() { |
func main() { |
||
runes := Runer(bufio.NewReader(os.Stdin)) |
|||
for r, err := runes(); err != nil; r,err = runes() { |
|||
fmt.Printf("%c", r) |
|||
} |
|||
} |
|||
}</lang> |
}</lang> |
||
Line 1,128: | Line 1,128: | ||
b$ = input$(#f,1) ' read one character |
b$ = input$(#f,1) ' read one character |
||
close #f</lang> |
close #f</lang> |
||
=={{header|Rust}}== |
|||
The Rust standard library provides hardly any straightforward way to read single UTF-8 characters |
|||
from a file. The following code implements an iterator that consumes a byte stream, taking only as |
|||
many bytes as necessary to decode the next UTF-8 character. It reports errors in enough detail |
|||
that client code can use them to deal with corrupted input. |
|||
The decoding code was originally based on the [https://docs.rs/crate/utf8-decode/1.0.0/source/ utf8-decode] |
|||
crate. |
|||
<lang Rust>use std::{ |
|||
convert::TryFrom, |
|||
fmt::{Debug, Display, Formatter}, |
|||
io::Read, |
|||
}; |
|||
/// Iterator adaptor that decodes a stream of bytes into UTF-8 characters.
///
/// Wraps any iterator yielding `Result<u8, E>` and yields one
/// `Result<char, Error<E>>` per decoded character or per detected error,
/// consuming only as many bytes as the current character needs.
pub struct ReadUtf8<I: Iterator> {
    /// Byte source. It is peekable so that a byte which turns out not to be a
    /// continuation byte can be left in place to start the next character.
    source: std::iter::Peekable<I>,
}

impl<R: Read> From<R> for ReadUtf8<std::io::Bytes<R>> {
    /// Builds a decoder over the bytes of any reader.
    fn from(source: R) -> Self {
        ReadUtf8 {
            source: source.bytes().peekable(),
        }
    }
}

impl<I, E> Iterator for ReadUtf8<I>
where
    I: Iterator<Item = Result<u8, E>>,
{
    type Item = Result<char, Error<E>>;

    /// Decodes the next character.
    ///
    /// Returns `None` once the source is exhausted, `Some(Ok(c))` for a
    /// decoded character and `Some(Err(_))` for a source error or malformed
    /// input.
    fn next(&mut self) -> Option<Self::Item> {
        let lead = match self.source.next()? {
            Ok(byte) => byte,
            Err(e) => return Some(Err(Error::SourceError(e))),
        };
        Some(self.complete_char(lead))
    }
}

impl<I, E> ReadUtf8<I>
where
    I: Iterator<Item = Result<u8, E>>,
{
    /// Reads one continuation byte (`10xxxxxx`) and returns its six payload bits.
    ///
    /// A byte that is not a continuation byte is reported as
    /// `Error::InvalidByte` but is *not* consumed, so the next call to `next`
    /// treats it as the lead byte of a new character.
    fn continuation(&mut self) -> Result<u32, Error<E>> {
        // Copy the peeked byte out first so the borrow of `source` ends
        // before the byte is consumed.
        let peeked = match self.source.peek() {
            Some(Ok(byte)) => Some(*byte),
            _ => None,
        };
        if let Some(byte) = peeked {
            return if byte & 0b1100_0000 == 0b1000_0000 {
                self.source.next();
                Ok(u32::from(byte & 0b0011_1111))
            } else {
                Err(Error::InvalidByte(byte))
            };
        }
        // Nothing to peek or a source error: consume it to take ownership of
        // the error value.
        match self.source.next() {
            None => Err(Error::InputTruncated),
            Some(Err(e)) => Err(Error::SourceError(e)),
            Some(Ok(_)) => unreachable!("peek() reported no readable byte"),
        }
    }

    /// Decodes the character introduced by `lead`, pulling the required
    /// continuation bytes from the source.
    ///
    /// # Errors
    ///
    /// Returns `Error::InvalidByte(lead)` when `lead` is not a valid lead
    /// byte, or when the sequence decodes to an overlong form, a surrogate or
    /// a value above U+10FFFF. Errors from `continuation` are propagated.
    fn complete_char(&mut self, lead: u8) -> Result<char, Error<E>> {
        let a = u32::from(lead);
        // `min` is the smallest code point that legitimately needs a sequence
        // of this length; anything below it is an overlong encoding.
        let (code, min) = if lead & 0b1000_0000 == 0 {
            (a, 0)
        } else if lead & 0b1110_0000 == 0b1100_0000 {
            let b = self.continuation()?;
            ((a & 0b0001_1111) << 6 | b, 0x80)
        } else if lead & 0b1111_0000 == 0b1110_0000 {
            let b = self.continuation()?;
            let c = self.continuation()?;
            ((a & 0b0000_1111) << 12 | b << 6 | c, 0x800)
        } else if lead & 0b1111_1000 == 0b1111_0000 {
            let b = self.continuation()?;
            let c = self.continuation()?;
            let d = self.continuation()?;
            ((a & 0b0000_0111) << 18 | b << 12 | c << 6 | d, 0x1_0000)
        } else {
            return Err(Error::InvalidByte(lead));
        };
        if code < min {
            return Err(Error::InvalidByte(lead));
        }
        // Surrogates and values above U+10FFFF are not valid `char`s; report
        // them as errors instead of panicking on malformed input.
        char::try_from(code).map_err(|_| Error::InvalidByte(lead))
    }
}

/// Errors produced while decoding UTF-8 from a byte source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error<E> {
    /// A byte that is not valid at its position: an invalid lead byte, a
    /// non-continuation byte where a continuation byte was required, or the
    /// lead byte of a sequence that does not decode to a valid character.
    InvalidByte(u8),
    /// The source ended in the middle of a multi-byte character.
    InputTruncated,
    /// The underlying byte source reported an error.
    SourceError(E),
}

impl<E: Display> Display for Error<E> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidByte(b) => write!(f, "invalid byte 0x{:x}", b),
            Self::InputTruncated => write!(f, "character truncated"),
            Self::SourceError(e) => e.fmt(f),
        }
    }
}

impl<E> std::error::Error for Error<E>
where
    E: std::error::Error + 'static,
{
    /// Exposes the wrapped source error, if any, as the cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::SourceError(e) => Some(e),
            Self::InvalidByte(_) | Self::InputTruncated => None,
        }
    }
}
|||
fn main() -> std::io::Result<()> { |
|||
for (index, value) in ReadUtf8::from(std::fs::File::open("test.txt")?).enumerate() { |
|||
match value { |
|||
Ok(c) => print!("{}", c), |
|||
Err(e) => { |
|||
print!("\u{fffd}"); |
|||
eprintln!("offset {}: {}", index, e); |
|||
} |
|||
} |
|||
} |
|||
Ok(()) |
|||
}</lang> |
|||
=={{header|Seed7}}== |
=={{header|Seed7}}== |