Word frequency: Difference between revisions
Content added Content deleted
Drkameleon (talk | contribs) (Added Arturo implementation) |
|||
Line 2,499: | Line 2,499: | ||
</pre> |
</pre> |
||
Using IORefs as values in the map seems to give a ~2x speedup on large files: |
|||
<lang haskell> |
|||
module Main where |
|||
import Data.Function ((&)) |
|||
Or, perhaps a little more simply: |
|||
import Data.Foldable (traverse_) |
|||
import Control.Monad (foldM, when, forM_) |
|||
import Data.Char (isSpace, toLower, isAlpha) |
|||
import Data.List (sortOn, foldl', filter) |
|||
import Data.Ord (Down(..)) |
|||
import System.IO (stdin, IOMode(..), openFile, hClose) |
|||
import System.Environment (getArgs) |
|||
import Data.IORef (IORef(..), newIORef, readIORef, modifyIORef') |
|||
-- unordered-containers |
|||
import Data.HashMap.Strict (HashMap) |
|||
import qualified Data.HashMap.Strict as M |
|||
-- text |
|||
import Data.Text (Text) |
|||
import qualified Data.Text as T |
|||
import qualified Data.Text.IO as T |
|||
-- | Build a table mapping each word to a mutable occurrence counter.
--
-- The first time a word is seen a fresh 'IORef' holding 1 is inserted; every
-- later occurrence increments that word's existing counter with the strict
-- 'modifyIORef'' and leaves the same reference in the map.
frequencies :: [Text] -> IO (HashMap Text (IORef Int))
frequencies = foldM bump M.empty
  where
    -- Fold step: update the entry for one word in the accumulated table.
    bump table word = M.alterF step word table

    -- Create the counter for an unseen word, or increment the existing one.
    step existing = case existing of
      Nothing -> do
        counter <- newIORef (1 :: Int)
        pure (Just counter)
      Just counter -> do
        modifyIORef' counter (+ 1)
        pure (Just counter)
|||
-- | Entry point: print the @n@ most frequent whitespace-separated words.
--
-- Command-line usage:
--
-- * no arguments: read standard input and report the top 10 words;
-- * one argument @n@: read standard input and report the top @n@ words;
-- * two or more arguments @n file ...@: read @file@ and report the top @n@
--   words (any further arguments are ignored).
--
-- The input is lower-cased, split on whitespace, and empty fragments are
-- dropped before counting. The result is printed as a list of
-- @(word, count)@ pairs sorted by descending count.
--
-- Fails with an 'IOError' if the count argument is not an integer.
main :: IO ()
main = do
  args <- getArgs
  (n, hand, filep) <- case args of
    [] -> return (10, stdin, False)
    [ns] -> do
      k <- parseCount ns
      return (k, stdin, False)
    (ns : fp : _) -> do
      -- Validate the count before opening the file so that a malformed count
      -- cannot leave an open handle behind.
      k <- parseCount ns
      h <- openFile fp ReadMode
      return (k, h, True)
  contents <- T.hGetContents hand
  freqtable <-
    frequencies $ filter (not . T.null) $ T.split isSpace $ T.map toLower contents
  counts <- mapM readRef (M.toList freqtable)
  -- Use the requested count rather than a fixed 10.
  print $ take n $ sortOn (Down . snd) counts
  -- Only close handles we opened ourselves; stdin is left alone.
  when filep (hClose hand)
  where
    -- Pair a word with the current value of its counter.
    readRef (w, ref) = do
      cnt <- readIORef ref
      return (w, cnt)

    -- Parse the word-count argument eagerly, tolerating trailing whitespace
    -- (as 'read' does) and reporting anything else as a readable error.
    parseCount :: String -> IO Int
    parseCount s = case reads s of
      [(k, rest)] | all isSpace rest -> return k
      _ -> ioError (userError ("invalid word count: " ++ show s))
|||
</lang> |
|||
{{Out}} |
|||
<pre> |
|||
$ ./word_count 10 < ~/doc/les_miserables* |
|||
[("the",40378),("of",19869),("and",14468),("a",14278),("to",13590),("in",11025),("he",9213),("was",8347),("that",7249),("his",6414)] |
|||
</pre> |
|||
Or, perhaps a little more simply, though not streaming (this reads everything into memory, so don't use it on big files): |
|||
<lang haskell>import qualified Data.Text.IO as T |
<lang haskell>import qualified Data.Text.IO as T |
||
import qualified Data.Text as T |
import qualified Data.Text as T |