Read a file character by character/UTF8: Difference between revisions

Add NetRexx
m (→‎version 1: changed word choice in the section header comments. -- ~~~~)
(Add NetRexx)
Line 15:
b$ = input$(#f,1) ' read one character
close #f</lang>
 
=={{header|NetRexx}}==
[[Java]] and, by extension, [[NetRexx]] provide I/O functions that read UTF-8 encoded character data directly from an attached input stream.
The <tt>Reader.read()</tt> method reads a single character as an integer value in the range 0 &ndash; 65535 [0x00 &ndash; 0xffff]. Reading from a file encoded in UTF-8 therefore delivers each character as an <tt>int</tt>; code points above U+FFFF arrive as two successive surrogate values.
In the sample below the <tt>readCharacters</tt> method reads the file character by character into a <tt>String</tt> and returns the result to the caller. The rest of this sample examines the result and formats the details.
 
The file <tt>data/utf8.txt</tt> is a UTF-8 encoded text file containing the following:&nbsp;&#x79;&#xE4;&#xAE;&#x20AC;&#x1D11E;&#x1D122;&#xC6;&#xE6;&#x1E9E;&#xDF;&#x31;&#x32;.
<lang NetRexx>/* NetRexx */
-- Compiler options for this program; 'nobinary' keeps the default Rexx
-- (non-binary) arithmetic and typing for methods not marked 'binary'.
options replace format comments java crossref symbols nobinary
-- Use 20 significant digits for Rexx arithmetic in this program.
numeric digits 20
 
-- Script entry point: pass the command-line argument string to runSample.
runSample(arg)
return
 
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-- Read a UTF-8 encoded text file one character at a time.
--
-- @param fName path of the file to read
-- @return the decoded contents of the file; an empty string if the file
--         could not be opened or read (the exception's stack trace is printed)
method readCharacters(fName) public static binary returns String
  slurped = String('')
  slrp = StringBuilder()
  fr = Reader null
  fFile = File(fName)
  EOF = int -1 -- End Of File indicator
  do
    -- Decode explicitly as UTF-8. FileReader would silently use the platform's
    -- default charset, which is not UTF-8 on every system.
    fr = BufferedReader(InputStreamReader(FileInputStream(fFile), 'UTF-8'))
    ic = int
    cc = char
    -- read the contents of the file one character at a time
    loop label rdr forever
      -- Reader.read reads a single character as an integer value in the range 0 - 65535 [0x00 - 0xffff]
      -- or -1 on end of stream i.e. End Of File
      ic = fr.read()
      if ic == EOF then leave rdr
      cc = Rexx(ic).d2c
      slrp.append(cc)
      end rdr
    -- load the results of the read into a variable
    slurped = slrp.toString()
  catch fex = FileNotFoundException
    fex.printStackTrace()
  catch iex = IOException
    -- also covers UnsupportedEncodingException from the InputStreamReader constructor
    iex.printStackTrace()
  finally
    -- always release the underlying file handle, even after a failed read
    if fr \= null then do
      fr.close()
    catch iex = IOException
      iex.printStackTrace()
      end
    end
  return slurped
 
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-- Print a summary line for the string followed by one detail line per code
-- point. A code point that occupies two UTF-16 units is followed by one
-- additional detail line for each of its surrogate halves.
--
-- @param str the text to examine
method encodingDetails(str = String) public static
  unitCount = str.length()
  pointCount = Character.codePointCount(str, 0, unitCount)
  say 'Unicode: length="'unitCount'" code_point_count="'pointCount'" string="'str'"'
  pos = 0
  loop label units while pos < unitCount
    codePoint = Rexx(Character.codePointAt(str, pos))
    width = Rexx(Character.charCount(codePoint)) -- number of UTF-16 units used by this code point
    say ' 'formatCodePoint(pos, width, codePoint)
    if width > 1 then do
      halves = [Rexx(Character.highSurrogate(codePoint)).c2d(), Rexx(Character.lowSurrogate(codePoint)).c2d()]
      loop half = 0 to width - 1
        -- each surrogate is reported at its own index within the string
        say ' 'formatCodePoint(pos + half, 1, halves[half])
        end half
      end
    -- step over every UTF-16 unit consumed by this code point
    pos = pos + width
    end units
  say
  return
 
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-- Build a one-line description of a code point: its index, UTF-16 unit count,
-- U+ id, hex/decimal/octal values, the character itself, its UTF-16BE and
-- UTF-8 byte sequences (comma separated hex) and its Unicode name.
--
-- @param ix index of the code point within the string being examined
-- @param cc number of UTF-16 units the code point occupies
-- @param cp the code point value
-- @return the formatted description
-- @see http://docs.oracle.com/javase/6/docs/technotes/guides/intl/encoding.doc.html
-- @since Java 1.7
method formatCodePoint(ix, cc, cp) private static
  scp = Rexx(Character.toChars(cp))
  icp = cp.d2x(8).x2d(9) -- signed to unsigned conversion
  ocp = Rexx(Integer.toOctalString(icp))
  x_utf16 = ''
  x_utf8 = ''
  do
    b_utf16 = String(scp).getBytes('UTF-16BE')
    b_utf8 = String(scp).getBytes('UTF-8')
    -- UTF-16BE bytes are shown two at a time, one group per 16-bit unit
    loop bv = 0 to b_utf16.length - 1 by 2
      x_utf16 = x_utf16 Rexx(b_utf16[bv]).d2x(2) || Rexx(b_utf16[bv + 1]).d2x(2)
      end bv
    loop bv = 0 to b_utf8.length - 1
      x_utf8 = x_utf8 Rexx(b_utf8[bv]).d2x(2)
      end bv
    -- collapse the blank separated groups into a comma separated list
    x_utf16 = x_utf16.space(1, ',')
    x_utf8 = x_utf8.space(1, ',')
  catch ex = UnsupportedEncodingException
    ex.printStackTrace()
    end
  cpName = Character.getName(cp)
  -- Character.getName returns null for unassigned code points; report an
  -- empty name rather than passing null into the concatenation below
  if cpName = null then cpName = ''
  fmt = -
    'CodePoint:' -
    'index="'ix.right(3, 0)'"' -
    'character_count="'cc'"' -
    'id="U+'cp.d2x(5)'"' -
    'hex="0x'cp.d2x(6)'"' -
    'dec="'icp.right(7, 0)'"' -
    'oct="'ocp.right(7, 0)'"' -
    'char="'scp'"' -
    'utf-16="'x_utf16'"' -
    'utf-8="'x_utf8'"' -
    'name="'cpName'"'
  return fmt
-- ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-- Sample driver: treat the argument string as a blank-delimited list of file
-- names (falling back to 'data/utf8.txt' when none is given), then read each
-- file and print its contents followed by the per-code-point details.
--
-- @param arg the command-line argument string
method runSample(arg) public static
  remaining = arg
  if remaining = '' then remaining = 'data/utf8.txt'
  loop label files while remaining \= ''
    -- peel the next file name off the front of the list
    parse remaining current remaining
    contents = readCharacters(current)
    say "Input:" contents
    encodingDetails(contents)
    end files
  say
  return
</lang>
{{out}}
<pre>
Input: yä®€𝄞𝄢12
Unicode: length="10" code_point_count="8" string="yä®€𝄞𝄢12"
CodePoint: index="000" character_count="1" id="U+00079" hex="0x000079" dec="0000121" oct="0000171" char="y" utf-16="0079" utf-8="79" name="LATIN SMALL LETTER Y"
CodePoint: index="001" character_count="1" id="U+000E4" hex="0x0000E4" dec="0000228" oct="0000344" char="ä" utf-16="00E4" utf-8="C3,A4" name="LATIN SMALL LETTER A WITH DIAERESIS"
CodePoint: index="002" character_count="1" id="U+000AE" hex="0x0000AE" dec="0000174" oct="0000256" char="®" utf-16="00AE" utf-8="C2,AE" name="REGISTERED SIGN"
CodePoint: index="003" character_count="1" id="U+020AC" hex="0x0020AC" dec="0008364" oct="0020254" char="€" utf-16="20AC" utf-8="E2,82,AC" name="EURO SIGN"
CodePoint: index="004" character_count="2" id="U+1D11E" hex="0x01D11E" dec="0119070" oct="0350436" char="𝄞" utf-16="D834,DD1E" utf-8="F0,9D,84,9E" name="MUSICAL SYMBOL G CLEF"
CodePoint: index="004" character_count="1" id="U+0D834" hex="0x00D834" dec="0055348" oct="0154064" char="?" utf-16="FFFD" utf-8="3F" name="HIGH SURROGATES D834"
CodePoint: index="005" character_count="1" id="U+0DD1E" hex="0x00DD1E" dec="0056606" oct="0156436" char="?" utf-16="FFFD" utf-8="3F" name="LOW SURROGATES DD1E"
CodePoint: index="006" character_count="2" id="U+1D122" hex="0x01D122" dec="0119074" oct="0350442" char="𝄢" utf-16="D834,DD22" utf-8="F0,9D,84,A2" name="MUSICAL SYMBOL F CLEF"
CodePoint: index="006" character_count="1" id="U+0D834" hex="0x00D834" dec="0055348" oct="0154064" char="?" utf-16="FFFD" utf-8="3F" name="HIGH SURROGATES D834"
CodePoint: index="007" character_count="1" id="U+0DD22" hex="0x00DD22" dec="0056610" oct="0156442" char="?" utf-16="FFFD" utf-8="3F" name="LOW SURROGATES DD22"
CodePoint: index="008" character_count="1" id="U+00031" hex="0x000031" dec="0000049" oct="0000061" char="1" utf-16="0031" utf-8="31" name="DIGIT ONE"
CodePoint: index="009" character_count="1" id="U+00032" hex="0x000032" dec="0000050" oct="0000062" char="2" utf-16="0032" utf-8="32" name="DIGIT TWO"
</pre>
 
=={{header|Perl 6}}==
Anonymous user