Anonymous user
File size distribution: Difference between revisions
m
→{{header|Haskell}}
Line 391:
</pre>
=={{header|Haskell}}==
Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. Distribution groups of 0 are removed. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count.
<lang haskell>{-# LANGUAGE TupleSections, LambdaCase #-}
import Control.
import Control.Concurrent.Chan (Chan, newChan, readChan, writeChan, writeList2Chan)
import
import
import Data.List (sort, genericLength,
import
import System.
import
import System.FilePath.Posix (pathSeparator, (</>))
import System.IO (hFileSize, withFile, IOMode(ReadMode), FilePath)
import Text.Printf (printf)
data Item = File FilePath Integer
Line 420 ⟶ 423:
placeGroups [] gs = gs
placeGroups (d:ds) gs = placeGroups ds $
fmap (\g@((min,max), count) ->
if d >= min && d
then ((min, max), succ count)
else g) gs
fileSizes :: [Item] -> [Integer]
Line 429 ⟶ 433:
f (File _ n) acc = n : acc
f _ acc = acc
folders :: [Item] -> [FilePath]
folders = foldr f []
where
f (Folder p) acc = p:acc
f _ acc = acc
paths :: [Item] -> [FilePath]
paths = fmap (\case File p _ -> p
Folder p -> p)
totalBytes :: [Item] -> Integer
totalBytes = sum . fileSizes
counts :: [Item] -> (Integer, Integer)
counts =
foldr (\x (a, b) -> case x of File _ _ -> (succ a, b)
Folder _ -> (a, succ b)
) (0, 0)
groupsFromGroup :: [Integer] -> FrequencyGroup -> [FrequencyGroup]
groupsFromGroup fileSizes ((min, max), count) = frequencyGroups 10 range
where
collectBetween min max = filter (\n -> n >= min && n <= max)
range = collectBetween min max fileSizes
expandGroups :: [Integer] -> Integer -> [FrequencyGroup] -> [FrequencyGroup]
expandGroups fileSizes groupThreshold groups
| all ((<= groupThreshold) . snd) groups = groups
| otherwise = expandGroups fileSizes groupThreshold $ expand groups
where
expand = ((\g@((min, max), count) ->
if count > groupThreshold then
groupsFromGroup fileSizes g
else
[g]
) =<<)
displaySize :: Integer -> String
displaySize n
| n <= 2^10 = show n <> "B"
| n >= 2^10 && n <= 2^20 =
| n >= 2^20 && n <= 2^30 =
| n >= 2^30 && n <= 2^40 =
| n >= 2^40 && n <= 2^50 =
| otherwise = "Too large!"
where
folderWorker :: Chan FilePath -> Chan [Item] -> IO ()
folderWorker folderChan resultItemsChan =
forever (readChan folderChan >>= collectItems >>= writeChan resultItemsChan)
collectItems :: FilePath -> IO [Item]
collectItems
contents <- fmap (
files <- filterM doesFileExist contents
folders <- drop 2 <$> filterM doesDirectoryExist contents
displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((min, max), count) =
printf "%5s <-> %5s = %
(displaySize min)
(displaySize max)
count
percentage
bars
where
percentage :: Double
percentage = (realToFrac count / realToFrac filesCount) * 100
bars = replicate (round percentage) '█'
parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:xs)
| all isDigit y = Right (x, read y)
| otherwise = Left "Invalid frequency group size"
parseArgs (x:xs) = Right (x, 4)
parseArgs _ = Right (".", 4)
parallelItemCollector :: FilePath -> IO [Item]
parallelItemCollector folder = do
wCount <- getNumProcessors
setNumCapabilities wCount
printf "Using %d worker threads\n" wCount
folderChan <- newChan
resultItemsChan <- newChan
replicateM_ wCount (forkIO $ folderWorker folderChan resultItemsChan)
loop folderChan resultItemsChan [Folder folder]
where
loop :: Chan FilePath -> Chan [Item] -> [Item] -> IO [Item]
loop folderChan resultItemsChan xs = do
let fs = folders xs
regularFolders <- filterM (pathIsSymbolicLink >=> (pure . not)) fs
if null regularFolders then pure []
else do
writeList2Chan folderChan regularFolders
childItems <- replicateM (length regularFolders) (readChan resultItemsChan)
result <- mapM (loop folderChan resultItemsChan) childItems
pure (join childItems <> join result)
main :: IO ()
Line 470 ⟶ 541:
Left errorMessage -> putStrLn errorMessage
Right (path, groupSize) -> do
items <-
-- mapM_ putStrLn $ paths items
let (fileCount, folderCount) = counts items
printf "Total files: %d\n" fileCount
Line 476 ⟶ 548:
printf "Total size: %s\n" $ displaySize $ totalBytes items
putStrLn "\nDistribution:"
deepResults = expandedGroups (sizes items) (groupThreshold fileCount) results
mapM_ (displayFrequency fileCount) deepResults
where
filteredGroups n = filter ((>0) . snd) . frequencyGroups n . sizes
groupThreshold = round . (*0.25) . realToFrac
expandedGroups sizes n = filter ((>0) . snd) . expandGroups sizes n</lang>
{{out}}
<pre style="height: 50rem;">$ filedist
Using 4 worker threads
Total
Total folders: 65528
Total size: 6GB
Distribution:
0B <->
84B <-> 167B = 40942 9.48%: █████████
168B <-> 251B = 24867 5.76%: ██████
252B <-> 335B = 20019 4.64%: █████
336B <-> 419B = 15623 3.62%: ████
420B <-> 503B = 13403 3.10%: ███
504B <-> 587B = 12778 2.96%: ███
588B <-> 671B = 12125 2.81%: ███
672B <-> 755B = 12736 2.95%: ███
756B <-> 839B = 9565 2.22%: ██
826B <-> 2KB = 83110 19.25%: ███████████████████
2KB <-> 2KB = 34092 7.90%: ████████
2KB <-> 3KB = 20814 4.82%: █████
3KB <-> 4KB = 15088 3.49%: ███
4KB <-> 5KB = 10327 2.39%: ██
5KB <-> 6KB = 7608 1.76%: ██
6KB <-> 6KB = 6260 1.45%: █
6KB <-> 7KB = 4562 1.06%: █
7KB <-> 8KB = 3894 0.90%: █
8KB <-> 16KB = 18833 4.36%: ████
16KB <-> 24KB = 6188 1.43%: █
24KB <-> 32KB = 3342 0.77%: █
32KB <-> 40KB = 2100 0.49%:
40KB <-> 48KB = 1447 0.34%:
48KB <-> 56KB = 966 0.22%:
56KB <-> 64KB = 726 0.17%:
64KB <-> 72KB = 852 0.20%:
72KB <-> 81KB = 563 0.13%:
81KB <-> 161KB = 2368 0.55%: █
161KB <-> 242KB = 967 0.22%:
242KB <-> 322KB = 558 0.13%:
322KB <-> 403KB = 287 0.07%:
403KB <-> 483KB = 176 0.04%:
483KB <-> 564KB = 100 0.02%:
564KB <-> 644KB = 77 0.02%:
644KB <-> 725KB = 101 0.02%:
725KB <-> 805KB = 77 0.02%:
815KB <-> 2MB = 282 0.07%:
2MB <-> 2MB = 123 0.03%:
2MB <-> 3MB = 74 0.02%:
3MB <-> 4MB = 59 0.01%:
4MB <-> 5MB = 26 0.01%:
5MB <-> 6MB = 17 0.00%:
6MB <-> 6MB = 20 0.00%:
6MB <-> 7MB = 7 0.00%:
7MB <-> 8MB = 6 0.00%:
8MB <-> 16MB = 23 0.01%:
16MB <-> 24MB = 10 0.00%:
24MB <-> 32MB = 9 0.00%:
32MB <-> 40MB = 4 0.00%:
40MB <-> 49MB = 5 0.00%:
73MB <-> 81MB = 3 0.00%:
98MB <-> 196MB = 8 0.00%:
294MB <-> 392MB = 1 0.00%: </pre>
=={{header|Julia}}==
|