Word frequency: Difference between revisions
Content added Content deleted
(→{{header|zkl}}: rewrite) |
(→version 1: added support for most accented letters, words that contain an apostrophe, optimized the reading of the file to support non-Latin letters, added verbiage to the REXX's section header and output section.) |
||
Line 236: | Line 236: | ||
This REXX version doesn't need to sort the list of words. |
This REXX version doesn't need to sort the list of words. |
||
Currently, this version |
Currently, this version recognizes all the accented (non-Latin) letters that are present in the text that is specified to be used (and some other non-Latin letters as well). |
||
<lang rexx>/*REXX program reads and displays a count of words a file. Word case is ignored.*/ |
|||
This version also supports words that contain embedded apostrophes (<b><big>''' ' '''</big></b>) [that is, apostrophes within a word; for words that start or end with an apostrophe, that leading or trailing apostrophe is elided]. |
|||
Thus, ''' it's ''' is counted separately from '''it''' or ''' its'''. |
|||
<lang rexx>/*REXX pgm displays top 10 words in a file (includes foreign letters), case is ignored.*/ |
|||
parse arg fID top . /*obtain optional arguments from the CL*/ |
parse arg fID top . /*obtain optional arguments from the CL*/ |
||
if fID=='' | fID=="," then fID= 'les_mes.TXT' /*None specified? Then use the default.*/ |
if fID=='' | fID=="," then fID= 'les_mes.TXT' /*None specified? Then use the default.*/ |
||
if top=='' | top=="," then top= 10 /* " " " " " " */ |
if top=='' | top=="," then top= 10 /* " " " " " " */ |
||
c=0; |
c=0; @.=0; abcL="abcdefghijklmnopqrstuvwxyz'" /*initialize word list, count; alphabet*/ |
||
q= "'"; abcU= abcL; upper abcU /*define uppercase version of alphabet*/ |
|||
accL= 'üéâÄàÅÇêëèïîìéæôÖòûùÿáíóúÑ' /* " " of some accented chrs*/ |
|||
accU= 'ÜéâäàåçêëèïîìÉÆôöòûùÿáíóúñ' /* " lowercase accented characters.*/ |
|||
⚫ | |||
accG= 'αßΓπΣσµτΦΘΩδφε' /* " some lower/upper Greek letters*/ |
|||
a=abcL || abcL ||accL ||accL || accG /* " char string of after letters.*/ |
|||
b=abcL || abcU ||accL ||accU || accG || xrange() /* " char string of before " */ |
|||
x= 'Çà åå çÇ êÉ ëÉ áà óâ ªæ ºç ¿è ⌐é ¬ê ½ë «î »ï ▒ñ ┤ô ╣ù ╗û ╝ü' /*list of 16-bit chars.*/ |
|||
⚫ | |||
xs= words(d) /*num. " " " */ |
|||
⚫ | |||
!.= /* " the original word instances. */ |
|||
do #=1 while lines(fID)\==0; $=linein(fID) /*loop whilst there are lines in file. */ |
|||
if pos('├', $)\==0 then do k=1 for xs; _=word(x, k) /*any 16-bit chars? */ |
|||
$=changestr('├'left(_, 1), $, right(_, 1) ) /*convert.*/ |
|||
⚫ | |||
⚫ | |||
do while $\=''; parse var $ z $ /*now, process each word in the $ list.*/ |
|||
if left(z, 1)==q then z=substr(z, 2) /*starts with an apostrophe?*/ |
|||
if right(z, 1)==q then z=left(z, length(z) - 1) /*ends " " " */ |
|||
if z='' then iterate |
|||
if @.z==0 then do; c=c+1; !.c=z; end /*bump word count; assign word to array*/ |
if @.z==0 then do; c=c+1; !.c=z; end /*bump word count; assign word to array*/ |
||
@@.z= |
@@.z=z /*save the original case of the word. */ |
||
@.z=@.z + 1 /*bump the count of occurrences of word*/ |
@.z=@.z + 1 /*bump the count of occurrences of word*/ |
||
end /*while*/ |
end /*while*/ |
||
end /*#*/ |
end /*#*/ |
||
say right('word',40) " " center(' rank ',6) " count " /*display |
say right('word', 40) " " center(' rank ', 6) " count " /*display title for output*/ |
||
say right('════',40) " " center('══════',6) "═══════" /* " |
say right('════', 40) " " center('══════', 6) "═══════" /* " title separator.*/ |
||
do tops=1 by 0 until otops==tops|tops>top /*process enough words to satisfy TOP.*/ |
do tops=1 by 0 until otops==tops|tops>top /*process enough words to satisfy TOP.*/ |
||
Line 267: | Line 278: | ||
z=!.n /*get the name of the capitalized word.*/ |
z=!.n /*get the name of the capitalized word.*/ |
||
if count==mc then tl=tl z /*handle cases of tied number of words.*/ |
if count==mc then tl=tl z /*handle cases of tied number of words.*/ |
||
if count> |
if count> mc then do; mc=count /*this word count is the current max. */ |
||
tl=z /* " word " " " " */ |
tl=z /* " word " " " " */ |
||
end |
end |
||
Line 275: | Line 286: | ||
do d=1 for words(tl); _=word(tl, d) |
do d=1 for words(tl); _=word(tl, d) |
||
if d==1 then w=max(8, length(@._)) /*use the length of the first word used*/ |
if d==1 then w=max(8, length(@._)) /*use the length of the first word used*/ |
||
say right(@@._, 40 |
say right(@@._, 40) right(commas(tops), wr) right(commas(@._), w) |
||
@._=0 /*nullify this word count for next time*/ |
@._=0 /*nullify this word count for next time*/ |
||
end /*d*/ |
end /*d*/ |
||
tops=tops + words(tl) /*correctly handle the tied rankings. */ |
tops=tops + words(tl) /*correctly handle the tied rankings. */ |
||
end /*tops*/ |
end /*tops*/ |
||
⚫ | |||
/*──────────────────────────────────────────────────────────────────────────────────────*/ |
|||
commas: procedure; parse arg _; n=_'.9'; #=123456789; b=verify(n, #, "M") |
|||
e=verify(n, #'0', , verify(n, #"0.", 'M') ) - 4 |
|||
do j=e to b by -3; _=insert(',', _, j); end /*j*/; return _</lang> |
|||
{{out|output|text= when using the default inputs:}} |
{{out|output|text= when using the default inputs:}} |
||
This output agrees with '''UNIX Shell'''. |
|||
<pre> |
<pre> |
||
word rank count |
word rank count |
||
════ ══════ ═══════ |
════ ══════ ═══════ |
||
the 1 |
the 1 41,088 |
||
of 2 |
of 2 19,949 |
||
and 3 |
and 3 14,942 |
||
a 4 |
a 4 14,595 |
||
to 5 |
to 5 13,950 |
||
in 6 |
in 6 11,214 |
||
he 7 |
he 7 9,607 |
||
was 8 |
was 8 8,620 |
||
that 9 |
that 9 7,826 |
||
it 10 |
it 10 6,535 |
||
</pre> |
</pre> |
||
To see a list of the top 5,000 words that shows (among other things) words like '''it's''' as well as words containing accented letters, see the discussion page. <br><br> |
|||
===version 2=== |
===version 2=== |