Word frequency: Difference between revisions

(Added Arturo implementation)
Line 2,499:
</pre>
 
Using IORefs as values in the map seems to give a ~2x speedup on large files:
<lang haskell>
module Main where
 
import Data.Function ((&))
Or, perhaps a little more simply:
import Data.Foldable (traverse_)
import Control.Monad (foldM, when, forM_)
import Data.Char (isSpace, toLower, isAlpha)
import Data.List (sortOn, foldl', filter)
import Data.Ord (Down(..))
import System.IO (stdin, IOMode(..), openFile, hClose)
import System.Environment (getArgs)
import Data.IORef (IORef(..), newIORef, readIORef, modifyIORef')
 
-- unordered-containers
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as M
 
-- text
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
 
-- | Count occurrences of each word in the input list.
--
-- Each distinct word maps to an 'IORef' holding its running count. A word
-- seen for the first time gets a fresh counter initialised to 1; a word
-- already present has its existing counter incremented in place, so the
-- map entry itself is left pointing at the same reference.
frequencies :: [Text] -> IO (HashMap Text (IORef Int))
frequencies = foldM bump M.empty
  where
    -- Fold step: visit the slot for @word@ in the table built so far.
    bump table word = M.alterF touch word table

    -- Absent key: allocate a new counter starting at one.
    touch Nothing = Just <$> newIORef (1 :: Int)
    -- Present key: strictly increment the counter and keep the same slot.
    touch slot@(Just counter) = slot <$ modifyIORef' counter (+ 1)
 
-- | Print the @n@ most frequent words of the input as a list of
-- @(word, count)@ pairs, most frequent first.
--
-- Command-line handling:
--
-- * no arguments: @n = 10@, input is read from stdin;
-- * one argument: it is parsed as @n@, input is read from stdin;
-- * two or more arguments: the first is parsed as @n@, the second is the
--   path of the file to read (any further arguments are ignored).
--
-- The input is lower-cased and split on whitespace; empty fragments are
-- discarded before counting.
main :: IO ()
main = do
  args <- getArgs
  -- Match on the argument list directly rather than on its length, so no
  -- partial 'head' or lazy irrefutable pattern is needed.
  (n, hand, filep) <- case args of
    [] -> return (10, stdin, False)
    [ns] -> return (read ns, stdin, False)
    (ns : fp : _) -> fmap (\h -> (read ns, h, True)) (openFile fp ReadMode)
  contents <- T.hGetContents hand
  freqtable <-
    frequencies $ filter (not . T.null) $ T.split isSpace $ T.map toLower contents
  counts <- mapM readRef (M.toList freqtable)
  -- Honour the requested count: this was previously hard-coded to 10, so
  -- the parsed argument had no effect on the output.
  print $ take n $ sortOn (Down . snd) counts
  -- Only close handles this program opened itself; stdin is left alone.
  when filep (hClose hand)
  where
    -- Replace a word's counter reference with its current value.
    readRef (w, ref) = do
      cnt <- readIORef ref
      return (w, cnt)
</lang>
{{Out}}
<pre>
$ ./word_count 10 < ~/doc/les_miserables*
[("the",40378),("of",19869),("and",14468),("a",14278),("to",13590),("in",11025),("he",9213),("was",8347),("that",7249),("his",6414)]
</pre>
 
Or, perhaps a little more simply, though not streaming (it reads everything into memory, so don't use it on big files):
<lang haskell>import qualified Data.Text.IO as T
import qualified Data.Text as T
Anonymous user