Jump to content

File size distribution: Difference between revisions

m
Line 391:
</pre>
=={{header|Haskell}}==
Uses a grouped frequency distribution. Program arguments are optional. Arguments include starting directory and initial frequency distribution group size. Distribution groups of 0 are removed. After the first frequency distribution is computed it further breaks it down for any group that exceeds 25% of the total file count.
<lang haskell>{-# LANGUAGE TupleSections, LambdaCase #-}
 
import Control.Monad Concurrent (filterMforkIO, joinsetNumCapabilities)
import Control.Concurrent.Chan (Chan, newChan, readChan, writeChan, writeList2Chan)
import Data.Char (isDigit)
import DataControl.ListMonad (filterM, join, (sortreplicateM, genericLengthreplicateM_, genericTakeforever, (>=>))
import SystemData.DirectoryChar (getDirectoryContents, doesFileExist (isDigit)
import Data.List (sort, genericLength, genericTake, doesDirectoryExistsortBy)
import SystemGHC.EnvironmentConc (getArgsgetNumProcessors)
import System.FilePath.PosixDirectory (pathSeparatorgetDirectoryContents, (</>))doesFileExist
import System.IO (hFileSize, withFile , IOMode(ReadMode)doesDirectoryExist, FilePathpathIsSymbolicLink)
import TextSystem.Printf Environment (printfgetArgs)
import System.FilePath.Posix (pathSeparator, (</>))
import System.IO (hFileSize, withFile, IOMode(ReadMode), FilePath)
import Text.Printf (printf)
 
data Item = File FilePath Integer
Line 420 ⟶ 423:
placeGroups [] gs = gs
placeGroups (d:ds) gs = placeGroups ds $
fmap (\g@((min,max), count) -> if d >= min && d <= max
if d >= min && d then ((min,<= max), succ count)
then ((min, max), succ count)
else g) gs
else g) gs
 
fileSizes :: [Item] -> [Integer]
Line 429 ⟶ 433:
f (File _ n) acc = n : acc
f _ acc = acc
 
folders :: [Item] -> [FilePath]
folders = foldr f []
where
f (Folder p) acc = p:acc
f _ acc = acc
 
paths :: [Item] -> [FilePath]
paths = fmap (\case File p _ -> p
Folder p -> p)
 
totalBytes :: [Item] -> Integer
totalBytes = sum . fileSizes
 
counts :: [Item] -> (Integer, Integer)
counts =
foldr (\x (a, b) -> case x of File _ _ -> (succ a, b)
Folder _ -> (a, succ b)
) (0, 0)
 
groupsFromGroup :: [Integer] -> FrequencyGroup -> [FrequencyGroup]
groupsFromGroup fileSizes ((min, max), count) = frequencyGroups 10 range
where
collectBetween min max = filter (\n -> n >= min && n <= max)
range = collectBetween min max fileSizes
 
expandGroups :: [Integer] -> Integer -> [FrequencyGroup] -> [FrequencyGroup]
expandGroups fileSizes groupThreshold groups
| all ((<= groupThreshold) . snd) groups = groups
| otherwise = expandGroups fileSizes groupThreshold $ expand groups
where
expand = ((\g@((min, max), count) ->
if count > groupThreshold then
groupsFromGroup fileSizes g
else
[g]
) =<<)
 
displaySize :: Integer -> String
displaySize n
| n <= 2^10 = show n <> "B"
| n >= 2^10 && n <= 2^20 = fdisplay "KB" $ 2^10
| n >= 2^20 && n <= 2^30 = fdisplay "MB" $ 2^20
| n >= 2^30 && n <= 2^40 = fdisplay "GB" $ 2^30
| n >= 2^40 && n <= 2^50 = fdisplay "TB" $ 2^40
| otherwise = "Too large!"
where
fdisplay suffix = (<> suffix) . show . round . (realToFrac n /)
 
folderWorker :: Chan FilePath -> Chan [Item] -> IO ()
folderWorker folderChan resultItemsChan =
forever (readChan folderChan >>= collectItems >>= writeChan resultItemsChan)
 
collectItems :: FilePath -> IO [Item]
collectItems pfolderPath = do
contents <- fmap (pfolderPath </>) <$> getDirectoryContents pfolderPath
files <- filterM doesFileExist contents
folders <- drop 2 <$> filterM doesDirectoryExist contents
aitems <- mapM (\f -> File f <$> withFile f ReadMode hFileSize) files
bpure $ items <-> mapMfmap collectItemsFolder folders
pure $ a <> join b <> fmap Folder folders
 
displayFrequency :: Integer -> FrequencyGroup -> IO ()
displayFrequency filesCount ((min, max), count) =
printf "%5s <-> %5s = %d5d %5.2f%%: %-5s\n" (displaySize min) (displaySize max) count
(displaySize min)
(displaySize max)
count
percentage
bars
where
percentage :: Double
percentage = (realToFrac count / realToFrac filesCount) * 100
bars = replicate (round percentage) '█'
 
parseArgs :: [String] -> Either String (FilePath, Int)
parseArgs (x:y:xs)
| all isDigit y = Right (x, read y)
| otherwise = Left "Invalid frequency group size"
parseArgs (x:xs) = Right (x, 4)
parseArgs _ = Right (".", 4)
 
parallelItemCollector :: FilePath -> IO [Item]
parallelItemCollector folder = do
wCount <- getNumProcessors
setNumCapabilities wCount
printf "Using %d worker threads\n" wCount
folderChan <- newChan
resultItemsChan <- newChan
replicateM_ wCount (forkIO $ folderWorker folderChan resultItemsChan)
loop folderChan resultItemsChan [Folder folder]
where
loop :: Chan FilePath -> Chan [Item] -> [Item] -> IO [Item]
loop folderChan resultItemsChan xs = do
let fs = folders xs
regularFolders <- filterM (pathIsSymbolicLink >=> (pure . not)) fs
if null regularFolders then pure []
else do
writeList2Chan folderChan regularFolders
childItems <- replicateM (length regularFolders) (readChan resultItemsChan)
result <- mapM (loop folderChan resultItemsChan) childItems
pure (join childItems <> join result)
 
main :: IO ()
Line 470 ⟶ 541:
Left errorMessage -> putStrLn errorMessage
Right (path, groupSize) -> do
items <- collectItemsparallelItemCollector path
-- mapM_ putStrLn $ paths items
let (fileCount, folderCount) = counts items
printf "Total files: %d\n" fileCount
Line 476 ⟶ 548:
printf "Total size: %s\n" $ displaySize $ totalBytes items
putStrLn "\nDistribution:"
mapM_let displayFrequencyresults $= filteredGroups groupSize items
deepResults = expandedGroups (sizes items) (groupThreshold fileCount) results
mapM_ (displayFrequency fileCount) deepResults
where
countssizes ::= [Item]sort ->. (Int, Int)fileSizes
filteredGroups n = filter ((>0) . snd) . frequencyGroups n . sizes
counts =
groupThreshold = round . (*0.25) . realToFrac
foldr (\x (a, b) -> case x of File _ _ -> (succ a, b)
expandedGroups sizes n = filter ((>0) . snd) . expandGroups sizes n</lang>
Folder _ -> (a, succ b)
) (0, 0)
 
filteredGroups n = filter ((>0) . snd) . frequencyGroups n . sort . fileSizes</lang>
{{out}}
<pre style="height: 50rem;">$ filedist . 10
Using 4 worker threads
Total files: 431815
Total foldersfiles: 65549431798
Total folders: 65528
Total size: 6GB
 
Distribution:
0B <-> 39MB 83B = 43179943580 10.09%: ██████████
84B <-> 167B = 40942 9.48%: █████████
39MB <-> 78MB = 6
168B <-> 251B = 24867 5.76%: ██████
78MB <-> 118MB = 3
252B <-> 335B = 20019 4.64%: █████
118MB <-> 157MB = 5
336B <-> 419B = 15623 3.62%: ████
157MB <-> 196MB = 1
420B <-> 503B = 13403 3.10%: ███
353MB <-> 392MB = 1</pre>
504B <-> 587B = 12778 2.96%: ███
588B <-> 671B = 12125 2.81%: ███
672B <-> 755B = 12736 2.95%: ███
756B <-> 839B = 9565 2.22%: ██
826B <-> 2KB = 83110 19.25%: ███████████████████
2KB <-> 2KB = 34092 7.90%: ████████
2KB <-> 3KB = 20814 4.82%: █████
3KB <-> 4KB = 15088 3.49%: ███
4KB <-> 5KB = 10327 2.39%: ██
5KB <-> 6KB = 7608 1.76%: ██
6KB <-> 6KB = 6260 1.45%: █
6KB <-> 7KB = 4562 1.06%: █
7KB <-> 8KB = 3894 0.90%: █
8KB <-> 16KB = 18833 4.36%: ████
16KB <-> 24KB = 6188 1.43%: █
24KB <-> 32KB = 3342 0.77%: █
32KB <-> 40KB = 2100 0.49%:
40KB <-> 48KB = 1447 0.34%:
48KB <-> 56KB = 966 0.22%:
56KB <-> 64KB = 726 0.17%:
64KB <-> 72KB = 852 0.20%:
72KB <-> 81KB = 563 0.13%:
81KB <-> 161KB = 2368 0.55%: █
161KB <-> 242KB = 967 0.22%:
242KB <-> 322KB = 558 0.13%:
322KB <-> 403KB = 287 0.07%:
403KB <-> 483KB = 176 0.04%:
483KB <-> 564KB = 100 0.02%:
564KB <-> 644KB = 77 0.02%:
644KB <-> 725KB = 101 0.02%:
725KB <-> 805KB = 77 0.02%:
815KB <-> 2MB = 282 0.07%:
2MB <-> 2MB = 123 0.03%:
2MB <-> 3MB = 74 0.02%:
3MB <-> 4MB = 59 0.01%:
4MB <-> 5MB = 26 0.01%:
5MB <-> 6MB = 17 0.00%:
6MB <-> 6MB = 20 0.00%:
6MB <-> 7MB = 7 0.00%:
7MB <-> 8MB = 6 0.00%:
8MB <-> 16MB = 23 0.01%:
16MB <-> 24MB = 10 0.00%:
24MB <-> 32MB = 9 0.00%:
32MB <-> 40MB = 4 0.00%:
40MB <-> 49MB = 5 0.00%:
73MB <-> 81MB = 3 0.00%:
98MB <-> 196MB = 8 0.00%:
294MB <-> 392MB = 1 0.00%: </pre>
 
=={{header|Julia}}==
Anonymous user
Cookies help us deliver our services. By using our services, you agree to our use of cookies.