Read a file character by character/UTF8: Difference between revisions
→{{header|Java}}: Added Common Lisp |
→{{header|Common Lisp}}: Added C |
||
Line 8: | Line 8: | ||
;See also |
;See also |
||
* [[Read a file line by line]] |
* [[Read a file line by line]] |
||
=={{header|C}}== |
|||
<lang C>#include <stdio.h> |
|||
#include <wchar.h> |
|||
#include <stdlib.h> |
|||
#include <locale.h> |
|||
int main(void) |
|||
{ |
|||
/* If your native locale doesn't use UTF-8 encoding |
|||
* you need to replace the empty string with a |
|||
* locale like "en_US.utf8" |
|||
*/ |
|||
char *locale = setlocale(LC_ALL, ""); |
|||
FILE *in = fopen("input.txt", "r"); |
|||
wint_t c; |
|||
while ((c = fgetwc(in)) != WEOF) |
|||
putwchar(c); |
|||
fclose(in); |
|||
return EXIT_SUCCESS; |
|||
}</lang> |
|||
=={{header|Common Lisp}}== |
=={{header|Common Lisp}}== |
Revision as of 16:56, 11 February 2014
Read a file one character at a time, as opposed to reading the entire file at once.
The solution may be implemented as a procedure, which returns the next character in the file on each consecutive call (returning EOF when the end of the file is reached).
The procedure should support the reading of files containing UTF8 encoded wide characters, returning whole characters for each consecutive read.
- See also
C
<lang C>#include <stdio.h>
#include <wchar.h>
#include <stdlib.h>
#include <locale.h>

/* Copy "input.txt" to standard output one wide character at a time.
 *
 * Returns EXIT_SUCCESS when the whole file was copied, EXIT_FAILURE when
 * the locale cannot be set, the file cannot be opened, or reading stops
 * because of an error rather than end of file.
 */
int main(void)
{
    /* If your native locale doesn't use UTF-8 encoding
     * you need to replace the empty string with a
     * locale like "en_US.utf8"
     */
    if (setlocale(LC_ALL, "") == NULL) {
        fputs("cannot set locale\n", stderr);
        return EXIT_FAILURE;
    }

    FILE *in = fopen("input.txt", "r");
    if (in == NULL) {
        perror("input.txt");
        return EXIT_FAILURE;
    }

    /* fgetwc decodes one complete multibyte character per call. */
    wint_t c;
    while ((c = fgetwc(in)) != WEOF) {
        putwchar(c);
    }

    /* WEOF is returned for end of file and for errors alike; the stream's
     * error indicator tells the two apart. */
    int failed = ferror(in);
    fclose(in);

    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}</lang>
Common Lisp
<lang lisp>;; CLISP puts the external formats into a separate package
#+clisp (import 'charset:utf-8 'keyword)

;; Echo input.txt to standard output one character at a time.
;; READ-CHAR is called with eof-error-p NIL, so it returns NIL at end of
;; file and that ends the loop.
(with-open-file (s "input.txt" :external-format :utf-8)
  (loop for c = (read-char s nil)
        while c
        do (format t "~a" c)))</lang>
Java
<lang java>import java.io.*;
import java.nio.charset.StandardCharsets;

/**
 * Reads a text file as UTF-8, one character at a time, and reports details
 * of every code point found in the text that was read.
 */
public class RUTF8CharacterReader {
  /** Format of the one-line report printed for a code point or surrogate. */
  private static final String FMT_002 = "codepoint_index=\"%03d\" character_count=\"%d\" unicode_id=\"U+%05X\" hex=\"%#08x\" dec=\"%07d\" oct=\"%07o\" string=\"%s\" utf-16=\"%s\" utf-8=\"%s\" character_name=\"%s\"%n";

  private String slurped;
  private String encoding;
  private String fName;
  private File fFile;

  // ---------------------------------------------------------------------------
  /**
   * Reads the named file one character at a time, decoding it as UTF-8.
   *
   * @param fileName path of the file to read
   * @return the text that was read, or null when nothing was read (empty
   *         file, missing file or I/O error; the exception's stack trace is
   *         printed in the latter two cases)
   */
  public String slurpChars(String fileName) {
    StringBuilder slrp = new StringBuilder();
    fName = fileName;
    fFile = new File(fName);
    // Name the charset explicitly: FileReader would silently use the
    // platform default encoding, which need not be UTF-8.
    try (InputStreamReader fr = new InputStreamReader(new FileInputStream(fFile), StandardCharsets.UTF_8)) {
      encoding = fr.getEncoding();
      int ic;
      // read() returns one UTF-16 code unit per call, or a negative value
      // at end of stream.
      while ((ic = fr.read()) >= 0) {
        slrp.append((char) ic);
      }
    }
    catch (FileNotFoundException ex) {
      ex.printStackTrace();
    }
    catch (IOException ex) {
      ex.printStackTrace();
    }
    slurped = slrp.length() > 0 ? slrp.toString() : null;
    return slurped;
  }

  // ---------------------------------------------------------------------------
  /**
   * Prints the encoding and file name, the length of the text read by
   * {@link #slurpChars(String)}, and one report line per code point. A code
   * point that needs two chars is followed by a line for each of its
   * surrogates.
   */
  public void encodingDetails() {
    String FMT_000 = "file_encoding=\"%s\" file_name=\"%s\"%n";
    String FMT_001 = "unicode_string_length=\"%d\" code_point_count=\"%d\" string=\"%s\"%n";
    // slurpChars leaves null behind when nothing could be read; report that
    // as an empty text instead of failing with a NullPointerException.
    String str = slurped == null ? "" : slurped;
    System.out.printf(FMT_000, encoding, fFile == null ? null : fFile.getAbsoluteFile());
    System.out.printf(FMT_001, str.length(), Character.codePointCount(str, 0, str.length()), str);
    for (int ix = 0; ix < str.length(); ++ix) {
      int cp = Character.codePointAt(str, ix);
      printCodePoint(ix, cp);
      if (Character.charCount(cp) > 1) {
        // Report both halves of the surrogate pair; the extra increment of
        // ix steps over the low surrogate so it is not decoded again.
        int ixx = ix++;
        printCodePoint(ixx++, Character.highSurrogate(cp));
        printCodePoint(ixx, Character.lowSurrogate(cp));
      }
    }
    return;
  }

  // ---------------------------------------------------------------------------
  /** Prints the report line for one code point (or lone surrogate) at the given index. */
  private static void printCodePoint(int index, int cp) {
    String x_utf16 = "";
    String x_utf8 = "";
    try {
      x_utf16 = codePointToUTF16(cp);
      x_utf8 = codePointToUTF8(cp);
    }
    catch (UnsupportedEncodingException ex) {
      ex.printStackTrace();
    }
    // The L suffix makes the mask a long; without it the literal is the int -1.
    System.out.printf(FMT_002, index, Character.charCount(cp), cp, cp, (cp & 0xffffffffL), cp,
        new String(Character.toChars(cp)), x_utf16, x_utf8, Character.getName(cp));
  }

  // ---------------------------------------------------------------------------
  /**
   * Returns the UTF-8 encoding of a code point as comma-separated two-digit
   * lower-case hex bytes, e.g. "e2,82,ac" for U+20AC.
   */
  public static String codePointToUTF8(int cp) throws UnsupportedEncodingException {
    String scp = new String(Character.toChars(cp));
    StringBuilder xparts = new StringBuilder();
    byte[] b_utf8 = scp.getBytes("UTF-8");
    for (int xx = 0; xx < b_utf8.length; ++xx) {
      if (xx > 0) {
        xparts.append(',');
      }
      xparts.append(String.format("%02x", b_utf8[xx]));
    }
    return xparts.toString();
  }

  // ---------------------------------------------------------------------------
  /**
   * Returns the big-endian UTF-16 encoding of a code point as comma-separated
   * four-digit lower-case hex code units, e.g. "d834,dd1e" for U+1D11E.
   */
  public static String codePointToUTF16(int cp) throws UnsupportedEncodingException {
    String scp = new String(Character.toChars(cp));
    StringBuilder xparts = new StringBuilder();
    byte[] b_utf16 = scp.getBytes("UTF-16BE");
    for (int xx = 0; xx < b_utf16.length; xx += 2) {
      if (xx > 0) {
        xparts.append(',');
      }
      xparts.append(String.format("%02x%02x", b_utf16[xx], b_utf16[xx + 1]));
    }
    return xparts.toString();
  }

  // ---------------------------------------------------------------------------
  /** Reports on the file named by the first argument, or on ./data/utf8-001.txt. */
  public static void main(String[] args) {
    String inFile;
    if (args.length > 0 && args[0].length() > 0) {
      inFile = args[0];
    }
    else {
      inFile = "./data/utf8-001.txt";
    }
    RUTF8CharacterReader lcl = new RUTF8CharacterReader();
    lcl.slurpChars(inFile);
    lcl.encodingDetails();
    return;
  }
} </lang>
- Output:
file_encoding="UTF8" file_name="/Users/RosettaCode/java/./data/utf8-001.txt" unicode_string_length="10" code_point_count="8" string="y䮀𝄞𝄢12" codepoint_index="000" character_count="1" unicode_id="U+00079" hex="0x000079" dec="0000121" oct="0000171" string="y" utf-16="0079" utf-8="79" character_name="LATIN SMALL LETTER Y" codepoint_index="001" character_count="1" unicode_id="U+000E4" hex="0x0000e4" dec="0000228" oct="0000344" string="ä" utf-16="00e4" utf-8="c3,a4" character_name="LATIN SMALL LETTER A WITH DIAERESIS" codepoint_index="002" character_count="1" unicode_id="U+000AE" hex="0x0000ae" dec="0000174" oct="0000256" string="®" utf-16="00ae" utf-8="c2,ae" character_name="REGISTERED SIGN" codepoint_index="003" character_count="1" unicode_id="U+020AC" hex="0x0020ac" dec="0008364" oct="0020254" string="€" utf-16="20ac" utf-8="e2,82,ac" character_name="EURO SIGN" codepoint_index="004" character_count="2" unicode_id="U+1D11E" hex="0x01d11e" dec="0119070" oct="0350436" string="𝄞" utf-16="d834,dd1e" utf-8="f0,9d,84,9e" character_name="MUSICAL SYMBOL G CLEF" codepoint_index="004" character_count="1" unicode_id="U+0D834" hex="0x00d834" dec="0055348" oct="0154064" string="?" utf-16="fffd" utf-8="3f" character_name="HIGH SURROGATES D834" codepoint_index="005" character_count="1" unicode_id="U+0DD1E" hex="0x00dd1e" dec="0056606" oct="0156436" string="?" utf-16="fffd" utf-8="3f" character_name="LOW SURROGATES DD1E" codepoint_index="006" character_count="2" unicode_id="U+1D122" hex="0x01d122" dec="0119074" oct="0350442" string="𝄢" utf-16="d834,dd22" utf-8="f0,9d,84,a2" character_name="MUSICAL SYMBOL F CLEF" codepoint_index="006" character_count="1" unicode_id="U+0D834" hex="0x00d834" dec="0055348" oct="0154064" string="?" utf-16="fffd" utf-8="3f" character_name="HIGH SURROGATES D834" codepoint_index="007" character_count="1" unicode_id="U+0DD22" hex="0x00dd22" dec="0056610" oct="0156442" string="?" 
utf-16="fffd" utf-8="3f" character_name="LOW SURROGATES DD22" codepoint_index="008" character_count="1" unicode_id="U+00031" hex="0x000031" dec="0000049" oct="0000061" string="1" utf-16="0031" utf-8="31" character_name="DIGIT ONE" codepoint_index="009" character_count="1" unicode_id="U+00032" hex="0x000032" dec="0000050" oct="0000062" string="2" utf-16="0032" utf-8="32" character_name="DIGIT TWO"
NetRexx
Java, and by extension NetRexx, provides I/O functions that read UTF-8 encoded character data directly from an attached input stream. The Reader.read() method reads a single character as an integer value in the range 0 – 65535 [0x00 – 0xffff]; when reading from a file encoded in UTF-8, the bytes are decoded so that each call returns one such value (a code point above 0xffff is delivered as two surrogate values, as the output below shows). In the sample below the readCharacters method reads the file character by character into a String and returns the result to the caller. The rest of this sample examines the result and formats the details.
- The file data/utf8-001.txt is a UTF-8 encoded text file containing the following: y䮀𝄞𝄢12.
<lang NetRexx>/* NetRexx */ options replace format comments java crossref symbols nobinary numeric digits 20
runSample(arg) return
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ method readCharacters(fName) public static binary returns String
slurped = String() slrp = StringBuilder() fr = Reader null fFile = File(fName) EOF = int -1 -- End Of File indicator do fr = BufferedReader(FileReader(fFile)) ic = int cc = char -- read the contents of the file one character at a time loop label rdr forever -- Reader.read reads a single character as an integer value in the range 0 - 65535 [0x00 - 0xffff] -- or -1 on end of stream i.e. End Of File ic = fr.read() if ic == EOF then leave rdr cc = Rexx(ic).d2c slrp.append(cc) end rdr -- load the results of the read into a variable slurped = slrp.toString() catch fex = FileNotFoundException fex.printStackTrace() catch iex = IOException iex.printStackTrace() finally if fr \= null then do fr.close() catch iex = IOException iex.printStackTrace() end end return slurped
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ method encodingDetails(str = String) public static
stlen = str.length() cplen = Character.codePointCount(str, 0, stlen) say 'Unicode: length="'stlen'" code_point_count="'cplen'" string="'str'"' loop ix = 0 to stlen - 1 cp = Rexx(Character.codePointAt(str, ix)) cc = Rexx(Character.charCount(cp)) say ' 'formatCodePoint(ix, cc, cp) if cc > 1 then do surrogates = [Rexx(Character.highSurrogate(cp)).c2d(), Rexx(Character.lowSurrogate(cp)).c2d()] loop sx = 0 to cc - 1 ix = ix + sx cp = surrogates[sx] say ' 'formatCodePoint(ix, 1, cp) end sx end end ix say return
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ -- @see http://docs.oracle.com/javase/6/docs/technotes/guides/intl/encoding.doc.html -- @since Java 1.7 method formatCodePoint(ix, cc, cp) private static
scp = Rexx(Character.toChars(cp)) icp = cp.d2x(8).x2d(9) -- signed to unsigned conversion ocp = Rexx(Integer.toOctalString(icp)) x_utf16 = x_utf8 = do b_utf16 = String(scp).getBytes('UTF-16BE') b_utf8 = String(scp).getBytes('UTF-8') loop bv = 0 to b_utf16.length - 1 by 2 x_utf16 = x_utf16 Rexx(b_utf16[bv]).d2x(2) || Rexx(b_utf16[bv + 1]).d2x(2) end bv loop bv = 0 to b_utf8.length - 1 x_utf8 = x_utf8 Rexx(b_utf8[bv]).d2x(2) end bv x_utf16 = x_utf16.space(1, ',') x_utf8 = x_utf8.space(1, ',') catch ex = UnsupportedEncodingException ex.printStackTrace() end cpName = Character.getName(cp) fmt = - 'CodePoint:' - 'index="'ix.right(3, 0)'"' - 'character_count="'cc'"' - 'id="U+'cp.d2x(5)'"' - 'hex="0x'cp.d2x(6)'"' - 'dec="'icp.right(7, 0)'"' - 'oct="'ocp.right(7, 0)'"' - 'char="'scp'"' - 'utf-16="'x_utf16'"' - 'utf-8="'x_utf8'"' - 'name="'cpName'"' return fmt
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ method runSample(arg) public static
parse arg fileNames if fileNames = then fileNames = 'data/utf8-001.txt' loop while fileNames \= parse fileNames fileName fileNames slurped = readCharacters(fileName) say "Input:" slurped encodingDetails(slurped) end say return
</lang>
- Output:
Input: y䮀𝄞𝄢12 Unicode: length="10" code_point_count="8" string="y䮀𝄞𝄢12" CodePoint: index="000" character_count="1" id="U+00079" hex="0x000079" dec="0000121" oct="0000171" char="y" utf-16="0079" utf-8="79" name="LATIN SMALL LETTER Y" CodePoint: index="001" character_count="1" id="U+000E4" hex="0x0000E4" dec="0000228" oct="0000344" char="ä" utf-16="00E4" utf-8="C3,A4" name="LATIN SMALL LETTER A WITH DIAERESIS" CodePoint: index="002" character_count="1" id="U+000AE" hex="0x0000AE" dec="0000174" oct="0000256" char="®" utf-16="00AE" utf-8="C2,AE" name="REGISTERED SIGN" CodePoint: index="003" character_count="1" id="U+020AC" hex="0x0020AC" dec="0008364" oct="0020254" char="€" utf-16="20AC" utf-8="E2,82,AC" name="EURO SIGN" CodePoint: index="004" character_count="2" id="U+1D11E" hex="0x01D11E" dec="0119070" oct="0350436" char="𝄞" utf-16="D834,DD1E" utf-8="F0,9D,84,9E" name="MUSICAL SYMBOL G CLEF" CodePoint: index="004" character_count="1" id="U+0D834" hex="0x00D834" dec="0055348" oct="0154064" char="?" utf-16="FFFD" utf-8="3F" name="HIGH SURROGATES D834" CodePoint: index="005" character_count="1" id="U+0DD1E" hex="0x00DD1E" dec="0056606" oct="0156436" char="?" utf-16="FFFD" utf-8="3F" name="LOW SURROGATES DD1E" CodePoint: index="006" character_count="2" id="U+1D122" hex="0x01D122" dec="0119074" oct="0350442" char="𝄢" utf-16="D834,DD22" utf-8="F0,9D,84,A2" name="MUSICAL SYMBOL F CLEF" CodePoint: index="006" character_count="1" id="U+0D834" hex="0x00D834" dec="0055348" oct="0154064" char="?" utf-16="FFFD" utf-8="3F" name="HIGH SURROGATES D834" CodePoint: index="007" character_count="1" id="U+0DD22" hex="0x00DD22" dec="0056610" oct="0156442" char="?" 
utf-16="FFFD" utf-8="3F" name="LOW SURROGATES DD22" CodePoint: index="008" character_count="1" id="U+00031" hex="0x000031" dec="0000049" oct="0000061" char="1" utf-16="0031" utf-8="31" name="DIGIT ONE" CodePoint: index="009" character_count="1" id="U+00032" hex="0x000032" dec="0000050" oct="0000062" char="2" utf-16="0032" utf-8="32" name="DIGIT TWO"
Perl
<lang perl>binmode STDOUT, ':utf8'; # so we can print wide chars without warning

open my $fh, "<:encoding(UTF-8)", "input.txt" or die "$!\n";

# On a handle with an encoding layer, read() counts characters, not bytes;
# it returns 0 at end of file, which ends the loop.
while (read $fh, my $char, 1) {
    # Pass the character as an argument rather than interpolating it into
    # the format, so that a '%' read from the file is printed literally.
    printf "got character %s [U+%04x]\n", $char, ord $char;
}

close $fh;</lang>
If the contents of the input.txt file are aă€⼥
then the output would be:
got character a [U+0061] got character ă [U+0103] got character € [U+20ac] got character ⼥ [U+2f25]
Perl 6
Perl 6 has a built in method .getc to get a single character from an open file handle. File handles default to UTF-8, so they will handle multi-byte characters correctly.
To read a single character at a time from the Standard Input terminal; $*IN in Perl 6: <lang perl6>.say while defined $_ = $*IN.getc;</lang>
Or, from a file: <lang perl6>my $filename = 'whatever';
my $in = open( $filename, :r ) or die "$!\n";
print $_ while defined $_ = $in.getc;</lang>
Python
<lang python> with open(filename,"rb") as f:
while True: onebyte=f.read(1) if not onebyte: break byte=onebyte[0]
</lang>
Python 3 simplifies the handling of text files since you can specify an encoding. <lang python>def get_next_character(f):
"""Reads one character from the given textfile""" c = f.read(1) while c: yield c c = f.read(1)
- Usage:
with open("input.txt", encoding="utf-8") as f:
for c in get_next_character(f): print(c, sep="", end="")</lang>
Racket
Don't we all love self reference? <lang racket>
#lang racket
; This file contains utf-8 characters: λ, α, γ ...
(for ([c (in-port read-char (open-input-file "read-file.rkt"))])
  (display c))
</lang> Output: <lang racket>
#lang racket
; This file contains utf-8 characters: λ, α, γ ...
(for ([c (in-port read-char (open-input-file "read-file.rkt"))])
  (display c))
</lang>
REXX
version 1
REXX doesn't support UTF8 encoded wide characters, just bytes.
The task's requirement stated that EOF was to be returned upon reaching the end-of-file, so this programming example was written as a subroutine (procedure).
Note that the display of characters that may modify screen behavior (such as tabs, backspaces, line feeds, carriage returns, "bells" and others) is suppressed, but their hexadecimal equivalents are displayed.
<lang rexx>/*REXX pgm reads/shows a file char by char, returning 'EOF' when done. */
parse arg f .                          /* F is the fileID to be read.*/
                                       /* [↓] show the file's contents.*/
if f\=='' then do j=1                  /*J counts the file's characters.*/
               x=getChar(f)
               if x=='EOF' then do; say 'End-Of-File.'; leave; end
               y=                      /*Y stays empty for low hex chars*/
               if x>>' ' then y=x      /*display char X if presentable.*/
               say right(j,20) 'character, (hex,char)' c2x(x) y
               end /*j*/               /* [↑] only show X if not low hex*/
exit                                   /*stick a fork in it, we're done.*/
/*───────────────────────────────GETCHAR subroutine─────────────────────*/
/*returns the next character of file Z, or 'EOF' when none are left.    */
getChar: procedure; parse arg z
if chars(z)==0 then return 'EOF'
return charin(z)</lang>
input file: ABC
and was created by the DOS command (under Windows/XP): echo [¬ a prime]> ABC
123 [¬ a prime]
output (for the above input file):
1 character, (hex,char) 31 1 2 character, (hex,char) 32 2 3 character, (hex,char) 33 3 4 character, (hex,char) 20 5 character, (hex,char) 5B [ 6 character, (hex,char) AA ¬ 7 character, (hex,char) 20 8 character, (hex,char) 61 a 9 character, (hex,char) 20 10 character, (hex,char) 70 p 11 character, (hex,char) 72 r 12 character, (hex,char) 69 i 13 character, (hex,char) 6D m 14 character, (hex,char) 65 e 15 character, (hex,char) 5D ] 16 character, (hex,char) 0D 17 character, (hex,char) 0A End-Of-File.
version 2
<lang rexx>/* REXX ---------------------------------------------------------------
* 29.12.2013 Walter Pachl
* read one utf8 character at a time
* see http://de.wikipedia.org/wiki/UTF-8#Kodierung
*--------------------------------------------------------------------*/
oid='utf8.txt';'erase' oid /* first create file containing utf8 chars*/
Call charout oid,'79'x
Call charout oid,'C3A4'x
Call charout oid,'C2AE'x
Call charout oid,'E282AC'x
Call charout oid,'F09D849E'x
Call lineout oid
fid='utf8.txt' /* then read it and show the contents */
Do Until c8='EOF'
  c8=get_utf8char(fid)
  Say left(c8,4) c2x(c8)
  End
Exit

/* return the next utf8 character (1 or more bytes) of file f, or 'EOF' */
get_utf8char: Procedure
  Parse Arg f
  If chars(f)=0 Then
    Return 'EOF'
  c=charin(f)
  b=c2b(c)
  If left(b,1)=0 Then          /* leading 0 bit: single byte character */
    Nop
  Else Do
    p=pos('0',b)               /* position of the first 0 bit          */
    Do i=1 To p-2              /* read the p-2 bytes that follow       */
      If chars(f)=0 Then Do
        Say 'illegal contents in file' f
        Leave
        End
      c=c||charin(f)
      End
    End
  Return c

/* convert a character string to its bit string */
c2b: Return x2b(c2x(arg(1)))</lang> output:
Ruby
Utf-8 is the default encoding since Ruby 2.0. In Ruby 1.9 use the magic comment "#encoding: utf-8" on the first line. <lang ruby>DATA.each_char{|c| p c}
__END__ characters: λ, α, γ</lang>
Run BASIC
<lang runbasic>open "file.txt" for binary as #f
numChars = 1                    ' specify number of characters to read
a$ = input$(#f,numChars)        ' read number of characters specified
b$ = input$(#f,1)               ' read one character
close #f</lang>
Tcl
To read a single character from a file, use: <lang tcl>set ch [read $channel 1]</lang> This will read multiple bytes sufficient to obtain a Unicode character if a suitable encoding has been configured on the channel. For binary channels, this will always consume exactly one byte. However, the low-level channel buffering logic may consume more than one byte (which only really matters where the channel is being handed on to another process and the channel is over a file descriptor that doesn't support the lseek OS call); the extent of buffering can be controlled via: <lang tcl>fconfigure $channel -buffersize $byteCount</lang> When the channel is only being accessed from Tcl (or via Tcl's C API) it is not normally necessary to adjust this option.