Find duplicate files

Find duplicate files is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.
In a large directory structure it is easy to inadvertently leave unnecessary copies of files around, which can use considerable disk space and create confusion. Create a program which, given a minimum size and a folder/directory, will find all files of at least size bytes with duplicate contents under the directory and output or show the sets of duplicate files in order of decreasing size.

The program may be command-line or graphical, and duplicate content may be determined by direct comparison or by calculating a hash of the data. Specify which filesystems or operating systems your program works with if it has any filesystem- or OS-specific requirements. Identify hard links (filenames referencing the same content) in the output if applicable for the filesystem. For extra points detect when whole directory sub-trees are identical, or optionally remove or link identical files.
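
Many of the entries below follow the same two-stage idea: group files by size first, and hash or compare only the size collisions. A minimal sketch of that approach, given here in Python purely for illustration (the function name duplicate_sets and its parameters are not part of any entry below):

import hashlib
import os

def duplicate_sets(top, min_size=1):
    # Stage 1: group candidate files by size; only size collisions need hashing.
    by_size = {}
    for root, _dirs, names in os.walk(top):
        for name in names:
            path = os.path.join(root, name)
            if os.path.isfile(path) and not os.path.islink(path):
                size = os.path.getsize(path)
                if size >= min_size:
                    by_size.setdefault(size, []).append(path)
    # Stage 2: within each size group, hash the contents; report sets of >1,
    # largest sizes first.
    for size in sorted(by_size, reverse=True):
        paths = by_size[size]
        if len(paths) < 2:
            continue                      # a unique size cannot have duplicates
        by_hash = {}
        for path in paths:
            with open(path, "rb") as f:   # whole-file hash; fine for a sketch
                digest = hashlib.md5(f.read()).hexdigest()
            by_hash.setdefault(digest, []).append(path)
        for group in by_hash.values():
            if len(group) > 1:
                yield size, group

if __name__ == "__main__":
    for size, group in duplicate_sets(".", 1024):
        print(size, group)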

Elixir

Translation of: Ruby
defmodule Files do
  def find_duplicate_files(dir) do
    IO.puts "\nDirectory : #{dir}"
    File.cd!(dir, fn ->
      Enum.filter(File.ls!, fn fname -> File.regular?(fname) end)
      |> Enum.group_by(fn file -> File.stat!(file).size end)
      |> Enum.filter(fn {_, files} -> length(files)>1 end)
      |> Enum.each(fn {size, files} ->
           Enum.group_by(files, fn file -> :erlang.md5(File.read!(file)) end)
           |> Enum.filter(fn {_, files} -> length(files)>1 end)
           |> Enum.each(fn {_md5, fs} ->
                IO.puts " --------------------------------------------"
                Enum.each(fs, fn file ->
                  IO.puts " #{inspect File.stat!(file).mtime}\t#{size} #{file}"
                end)
              end)
         end)
    end)
  end
end
 
hd(System.argv) |> Files.find_duplicate_files
Output:
C:\Elixir>elixir find_dup_file.exs \Windows\System32

Directory : \Windows\System32
  --------------------------------------------
  {{2009, 7, 14}, {1, 0, 32}}   31548  perfd009.dat
  {{2010, 11, 21}, {7, 14, 4}}  31548  perfd011.dat
  --------------------------------------------
  {{2015, 4, 29}, {18, 21, 50}} 5120  msdxm.ocx
  {{2015, 4, 29}, {18, 21, 50}} 5120  dxmasf.dll
  --------------------------------------------
  {{2010, 11, 21}, {3, 23, 55}} 91648  mapi32.dll
  {{2010, 11, 21}, {3, 23, 55}} 91648  mapistub.dll
  --------------------------------------------
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcp110_clr0400.dll
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcr100_clr0400.dll
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcr110_clr0400.dll

Haskell

- Checks for invalid command-line input (non-existent directory / negative size)
- Works on Windows as well as Unix systems (tested with Mint 17 / Windows 7)
 
import Crypto.Hash.MD5 (hash)
import Data.ByteString as BS (readFile, ByteString())
import System.Environment (getArgs, getProgName)
import System.Directory (doesDirectoryExist, getDirectoryContents)
import System.FilePath.Posix ((</>))
import Control.Monad (forM)
import Text.Printf (printf)
import System.IO (withFile, IOMode(ReadMode), hFileSize)
 
 
type File = (BS.ByteString, -- md5hash
             FilePath)      -- filepath

type FileSize = Integer

getRecursiveContents :: FilePath -> FileSize -> IO [File]
getRecursiveContents curDir maxsize = do
  names <- getDirectoryContents curDir
  let dirs = filter (`notElem` [".", ".."]) names
  files <- forM dirs $ \path -> do
    let path' = curDir </> path
    exists <- doesDirectoryExist path'
    if exists
      then getRecursiveContents path' maxsize
      else genFileHash path' maxsize
  return $ concat files


genFileHash :: FilePath -> FileSize -> IO [File]
genFileHash path maxsize = do
  size <- withFile path ReadMode hFileSize
  if size <= maxsize
    then BS.readFile path >>= \bs -> return [(hash bs, path)]
    else return []

findDuplicates :: FilePath -> FileSize -> IO ()
findDuplicates dir bytes = do
  exists <- doesDirectoryExist dir
  if exists
    then getRecursiveContents dir bytes >>= findSameHashes
    else printf "Sorry, the directory \"%s\" does not exist...\n" dir

findSameHashes :: [File] -> IO ()
findSameHashes [] = return ()
findSameHashes ((hash, fp):xs) = do
  case lookup hash xs of
    (Just dupFile) -> printf "===========================\n\
                             \Found duplicate:\n\
                             \=> %s \n\
                             \=> %s \n\n"
                             fp dupFile
                      >> findSameHashes xs
    (_) -> findSameHashes xs

main :: IO ()
main = do
  args <- getArgs
  case args of
    [dir, mbytes] | [(bytes ,"")] <- reads mbytes
                  , bytes >= 1 -> findDuplicates dir bytes
    (_) -> do
      name <- getProgName
      printf "Something went wrong - please use ./%s <dir> <bytes>\n" name
 
 

Example output:

$./finddups ~/Documents/MyGit/Haskell/ 20000
===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/.git/logs/HEAD 
=> /home/rewrite/Documents/MyGit/Haskell/.git/logs/refs/heads/master 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/.git/refs/remotes/origin/master 
=> /home/rewrite/Documents/MyGit/Haskell/.git/refs/heads/master 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/RosettaCode/Find-duplicate-files/sampletext.txt 
=> /home/rewrite/Documents/MyGit/Haskell/RosettaCode/otherdup.txt 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/RWH/FileManipulation/toupper-imp.hs 
=> /home/rewrite/Documents/MyGit/Haskell/RWH/FileManipulation/toupper-imp.hs~ 


$./finddups /home/rewrite/NotExistingDir 200000
Sorry, the directory "/home/rewrite/NotExistingDir" does not exist...


$./finddups /home/rewrite/ -100
Something went wrong - please use ./finddups <dir> <bytes>

Python

from __future__ import print_function
import os
import hashlib
import datetime
 
def FindDuplicateFiles(pth, minSize = 0, hashName = "md5"):
    knownFiles = {}

    #Analyse files
    for root, dirs, files in os.walk(pth):
        for fina in files:
            fullFina = os.path.join(root, fina)
            isSymLink = os.path.islink(fullFina)
            if isSymLink:
                continue # Skip symlinks
            si = os.path.getsize(fullFina)
            if si < minSize:
                continue
            if si not in knownFiles:
                knownFiles[si] = {}
            h = hashlib.new(hashName)
            h.update(open(fullFina, "rb").read())
            hashed = h.digest()
            if hashed in knownFiles[si]:
                fileRec = knownFiles[si][hashed]
                fileRec.append(fullFina)
            else:
                knownFiles[si][hashed] = [fullFina]

    #Print result
    sizeList = list(knownFiles.keys())
    sizeList.sort(reverse=True)
    for si in sizeList:
        filesAtThisSize = knownFiles[si]
        for hashVal in filesAtThisSize:
            if len(filesAtThisSize[hashVal]) < 2:
                continue
            fullFinaLi = filesAtThisSize[hashVal]
            print ("=======Duplicate=======")
            for fullFina in fullFinaLi:
                st = os.stat(fullFina)
                isHardLink = st.st_nlink > 1
                infoStr = []
                if isHardLink:
                    infoStr.append("(Hard linked)")
                fmtModTime = datetime.datetime.utcfromtimestamp(st.st_mtime).strftime('%Y-%m-%dT%H:%M:%SZ')
                print (fmtModTime, si, os.path.relpath(fullFina, pth), " ".join(infoStr))

if __name__=="__main__":

    FindDuplicateFiles('/home/tim/Dropbox', 1024*1024)
 

Racket

 
#lang racket
 
(struct F (name id size [links #:mutable]))
 
(require openssl/sha1)
(define (find-duplicate-files path size)
  (define Fs
    (sort
     (fold-files
      (λ(path type acc)
        (define s (and (eq? 'file type) (file-size path)))
        (define i (and s (<= size s) (file-or-directory-identity path)))
        (define ln (and i (findf (λ(x) (equal? i (F-id x))) acc)))
        (when ln (set-F-links! ln (cons (path->string path) (F-links ln))))
        (if (and i (not ln)) (cons (F path i s '()) acc) acc))
      '() path #f)
     > #:key F-size))
  (define (find-duplicates Fs)
    (define t (make-hash))
    (for ([F Fs])
      (define cksum (call-with-input-file (F-name F) sha1))
      (hash-set! t cksum (cons F (hash-ref t cksum '()))))
    (for/list ([(n Fs) (in-hash t)] #:unless (null? (cdr Fs))) Fs))
  (let loop ([Fs Fs])
    (if (null? Fs) '()
        (let-values ([(Fs Rs)
                      (splitf-at Fs (λ(F) (= (F-size F) (F-size (car Fs)))))])
          (append (find-duplicates Fs)
                  (loop Rs))))))

(define (show-duplicates path size)
  (for ([Fs (find-duplicate-files path size)])
    (define (links F)
      (if (null? (F-links F)) ""
          (format " also linked at ~a" (string-join (F-links F) ", "))))
    (printf "~a (~a)~a\n" (F-name (car Fs)) (F-size (car Fs)) (links (car Fs)))
    (for ([F (cdr Fs)]) (printf "  ~a~a\n" (F-name F) (links F)))))
 
(show-duplicates (find-system-path 'home-dir) 1024)
 

REXX

bare bones version

This REXX version works with DOS (with or without Microsoft Windows).
Note that the tFID (temp) file is hard-coded to the C: drive.
Only minimal error checking is performed.

/*REXX program reads a (DOS) directory and finds and displays files that are identical.*/
sep=center(' files are identical in size and content: ',79,"═")  /*define the header.  */
tFID= 'c:\TEMP\FINDDUP.TMP'                      /*use this as a temporary FileID.      */
arg maxSize aDir                                 /*obtain optional arguments from the CL*/
if maxSize='' | maxSize="," then maxSize=1000000 /*filesize limit (in bytes) [1 million]*/
aDir=strip(aDir)                                 /*remove any leading or trailing blanks*/
if right(aDir,1)\=='\' then aDir=aDir"\"         /*possibly add a trailing backslash [\]*/
"DIR" aDir '/a-d-s-h /oS /s | FIND "/" >' tFID   /*the (DOS) DIR output ───► temp file. */
pFN=                                             /*the previous filename and filesize.  */
pSZ=;  do j=0  while lines(tFID)\==0             /*process each of the files in the list*/
       aLine=linein(tFID)                        /*obtain (DOS) DIR's output about a FID*/
       parse var aLine . . sz fn                 /*obtain the filesize and its fileID.  */
       sz=space(translate(sz,,','),0)            /*elide any commas from the size number*/
       if sz>maxSize then leave                  /*Is the file > maximum?  Ignore file. */
                                                 /* [↓]  files identical?  (1st million)*/
       if sz==pSZ then if charin(aDir||pFN,1,sz)==charin(aDir||FN,1,sz) then do
                                                        say sep
                                                        say pLine
                                                        say aLine
                                                        say
                                                        end
       pSZ=sz;  pFN=FN;  pLine=aLine             /*remember the previous stuff for later*/
       end   /*j*/

if lines(tFID)\==0 then 'ERASE' tFID             /*do housecleaning  (delete temp file).*/
                                                 /*stick a fork in it, we're all done.  */

Output when checking the default root directory:

══════════════════ files are identical in size and content: ═══════════════════
04/13/2013  19:13                76 another.BK
04/13/2013  19:13                76 another.A

══════════════════ files are identical in size and content: ═══════════════════
04/13/2013  17:15               244 gettfid.1
04/13/2013  17:15               244 junk.1

══════════════════ files are identical in size and content: ═══════════════════
03/03/1995  01:46            10,897 $ERR.BK
03/03/1995  01:46            10,897 $ERR.ORI

with error checking

This version of the REXX program:

  •   checks to see if running under the DOS environment
  •   uses the TEMP folder for storing a temporary file
  •   verifies that maxSize is a positive integer
  •   adjusts the name for a generic file specification
  •   uses variables for some command names and command options
  •   shows the number of files examined and also the directory name
/*REXX program reads a (DOS) directory and finds and displays files that are identical.*/
sep=center(' files are identical in size and content: ',79,"═")  /*define the header.  */
parse arg !; if !all(arg()) then exit            /*boilerplate HELP(?)*/
signal on halt; signal on novalue; signal on syntax   /*handle exceptions, */

if \!dos then call err 'this program requires the DOS [environment].'
call getTFID                                     /*defines a temporary File ID for DOS. */
arg maxSize aDir                                 /*obtain optional arguments from the CL*/
if maxSize='' | maxSize="," then maxSize=1000000 /*filesize limit (in bytes) [1 million]*/
if \isInt(maxSize) then call err "maxSize isn't an integer:" maxSize
if maxSize<0 then call err "maxSize can't be negative:" maxSize
if maxSize=0 then call err "maxSize can't be zero:" maxSize
aDir=strip(aDir)                                 /*remove any leading or trailing blanks*/
if right(aDir,3)=='*.*' then aDir=substr(aDir,1,length(aDir)-3)  /*adjust the dir name. */
if right(aDir,1)\=='\' then aDir=aDir"\"         /*possibly add a trailing backslash [\]*/
@dir    = 'DIR'                                  /*literal for the (DOS)  DIR  command. */
@dirNots= '/a-d-s-h'                             /*ignore DIRs, SYSTEM, and HIDDEN files*/
@dirOpts= '/oS /s'                               /*sort DIR's (+ subdirs) files by size.*/
@filter = '| FIND "/"'                           /*the "lines" must have a slash  [/].  */
@erase  = 'ERASE'                                /*literal for the (DOS)  ERASE command.*/
@dir aDir @dirNots @dirOpts @filter '>' tFID     /*(DOS) DIR output ──► temporary file. */
pFN=                                             /*the previous filename and filesize.  */
pSZ=;  do j=0  while lines(tFID)\==0             /*process each of the files in the list*/
       aLine=linein(tFID)                        /*obtain (DOS) DIR's output about a FID*/
       parse var aLine . . sz fn                 /*obtain the filesize and its fileID.  */
       sz=space(translate(sz,,','),0)            /*elide any commas from the size number*/
       if sz>maxSize then leave                  /*Is the file > maximum?  Ignore file. */
                                                 /* [↓]  files identical?  (1st million)*/
       if sz==pSZ then if charin(aDir||pFN,1,sz)==charin(aDir||FN,1,sz) then do
                                                        say sep
                                                        say pLine
                                                        say aLine
                                                        say
                                                        end
       pSZ=sz;  pFN=FN;  pLine=aLine             /*remember the previous stuff for later*/
       end   /*j*/

say j 'file's(j) "examined in" aDir              /*show information to the screen.      */
if lines(tFID)\==0 then 'ERASE' tFID             /*do housecleaning  (delete temp file).*/
exit                                             /*stick a fork in it, we're all done.  */
/*═════════════════════════════general 1─line subs══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════*/
!all:  !!=!;!=space(!);upper !;call !fid;!nt=right(!var('OS'),2)=="NT";!cls=word('CLS VMFCLEAR CLRSCREEN',1+!cms+!tso*2);if arg(1)\==1 then return 0;if wordpos(!,"? ?SAMPLES ?AUTHOR ?FLOW")==0 then return 0;!call=']$H';call "$H" !fn !;!call=;return 1
!cal: if symbol('!CALL')\=="VAR" then !call=; return !call
!env:  !env='ENVIRONMENT'; if !sys=="MSDOS" | !brexx | !r4 | !roo then !env='SYSTEM'; if !os2 then !env="OS2"!env; !ebcdic=1=='f1'x; return
!fid: parse upper source !sys !fun !fid . 1 . . !fn !ft !fm .; call !sys; if !dos then do; _=lastpos('\',!fn); !fm=left(!fn,_); !fn=substr(!fn,_+1); parse var !fn !fn "." !ft; end; return word(0 !fn !ft !fm, 1+('0'arg(1)))
!rex: parse upper version !ver !vernum !verdate .; !brexx='BY'==!vernum; !kexx="KEXX"==!ver; !pcrexx='REXX/PERSONAL'==!ver | "REXX/PC"==!ver; !r4='REXX-R4'==!ver; !regina="REXX-REGINA"==left(!ver,11); !roo='REXX-ROO'==!ver; call !env; return
!sys:  !cms=!sys=='CMS'; !os2=!sys=="OS2"; !tso=!sys=='TSO' | !sys=="MVS"; !vse=!sys=='VSE'; !dos=pos("DOS",!sys)\==0|pos('WIN',!sys)\==0|!sys=="CMD"; call !rex; return
!var: call !fid; if !kexx then return space(dosenv(arg(1))); return space(value(arg(1),,!env))
err: say; say; say center(' error! ', 60, "*"); say; do j=1 for arg(); say arg(j); say; end; say; exit 13
getdTFID: tfid=p(!var("TMP") !var('TEMP') homedrive()"\"); if substr(tfid,2,1)==':'&substr(tfid,3,1)\=="\" then tfid=insert('\',t,2); return strip(tfid,"T",'\')"\"arg(1)'.'arg(2)
getTFID: if symbol('TFID')=="LIT" then tfid=; if tfid\=='' then return tfid; gfn=word(arg(1) !fn,1);gft=word(arg(2) "TMP",1); tfid='TEMP';if !tso then tfid=gfn"."gft;if !cms then tfid=gfn','gft",A4";if !dos then tfid=getdTFID(gfn,gft);return tfid
halt: call err 'program has been halted.'
homedrive: if symbol('HOMEDRIVE')\=="VAR" then homedrive=p(!var('HOMEDRIVE') "C:"); return homedrive
isint: return datatype(arg(1),'W')
novalue: syntax: call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl)
p: return word(arg(1),1)
s: if arg(1)==1 then return arg(3); return word(arg(2) 's',1)

Output when using the directory (folder) H:\#\REX:

══════════════════ files are identical in size and content: ═══════════════════
05/11/2015  18:49               838 UPDATECF.BU
05/11/2015  18:49               838 UPDATECF.TXT

══════════════════ files are identical in size and content: ═══════════════════
03/23/2014  21:55             2,736 EMIRP.RX_
03/26/2014  10:44             2,736 EMIRP2.RX_

══════════════════ files are identical in size and content: ═══════════════════
05/30/2015  17:30             4,542 JUSTIFY.RX_
11/25/2013  06:33             4,542 JUSTIFY.KX_

══════════════════ files are identical in size and content: ═══════════════════
06/15/2014  23:36            13,935 $BLOCK.KX_
05/30/2015  17:28            13,935 $BLOCK.RX_

1568 files examined in H:\#\REX\

Ruby

Files are first grouped by size; files of the same size are then compared by MD5 digest.

require 'digest/md5'
 
def find_duplicate_files(dir)
  puts "\nDirectory : #{dir}"
  Dir.chdir(dir) do
    file_size = Dir.foreach('.').select{|f| FileTest.file?(f)}.group_by{|f| File.size(f)}
    file_size.each do |size, files|
      next if files.size==1
      files.group_by{|f| Digest::MD5.file(f).to_s}.each do |md5,fs|
        next if fs.size==1
        puts " --------------------------------------------"
        fs.each{|file| puts " #{File.mtime(file)} #{size} #{file}"}
      end
    end
  end
end
 
find_duplicate_files("/Windows/System32")

Sample Output:

Directory : /Windows/System32
  --------------------------------------------
  2016-02-09 18:56:09 +0900  5120  dxmasf.dll
  2016-02-09 18:56:09 +0900  5120  msdxm.ocx
  --------------------------------------------
  2015-11-14 08:09:16 +0900  91648  mapi32.dll
  2015-11-14 08:09:16 +0900  91648  mapistub.dll
  --------------------------------------------
  2015-11-05 20:34:06 +0900  18592  msvcp110_clr0400.dll
  2015-11-05 20:34:06 +0900  18592  msvcr100_clr0400.dll
  2015-11-05 20:34:06 +0900  18592  msvcr110_clr0400.dll
  --------------------------------------------
  2009-07-14 10:00:32 +0900  31548  perfd009.dat
  2010-11-21 16:14:04 +0900  31548  perfd011.dat

Tested on MS Windows 7.

Sidef

It uses the portable File::Find module, which means it should work on virtually any platform.

# usage: sidef fdf.sf [size] [dir1] [...]
 
require('File::Find')
 
func find_duplicate_files(Block code, size_min=0, *dirs) {
    var files = Hash()
    %S<File::Find>.find(
        Hash(
            no_chdir => true,
            wanted   => func(arg) {
                var file = File(arg)
                file.is_file || return()
                file.is_link && return()
                var size = file.size
                size >= size_min || return()
                files{size} := [] << file
            },
        ) => dirs...
    )

    files.values.each { |set|
        set.len > 1 || next
        var dups = Hash()
        for i in (^set.end) {
            for (var j = set.end; j > i; --j) {
                if (set[i].compare(set[j]) == 0) {
                    dups{set[i]} := [] << set.pop_at(j++)
                }
            }
        }
        dups.each{ |k,v| code(k.to_file, v...) }
    }

    return()
}

var duplicates = Hash()
func collect(*files) {
    duplicates{files[0].size} := [] << files
}

find_duplicate_files(collect, Num(ARGV.shift), ARGV...)

for k,v in (duplicates.sort_by { |k| -k.to_i }) {
    say "=> Size: #{k}\n#{'~'*80}"
    for files in v {
        say "#{files.sort.join(%Q[\n])}\n#{'-'*80}"
    }
}

Section of sample output:

% sidef fdf.sf 0 /tmp /usr/bin
=> Size: 5656
~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/bin/precat
/usr/bin/preunzip
/usr/bin/prezip
--------------------------
=> Size: 2305
~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/bin/gunzip
/usr/bin/uncompress
--------------------------
=> Size: 2
~~~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/a.txt
/tmp/b.txt
--------------------------
/tmp/m.txt
/tmp/n.txt
--------------------------

Tcl

Only known to work on Unix. Uses both inode number checking and content hashing to do duplicate detection.
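
The inode check is what lets hard links be reported as one file rather than as separate duplicates: two names whose (device, inode) pairs match refer to the same underlying file. A minimal sketch of that grouping, shown in Python purely for illustration (hard_link_groups is a hypothetical helper, not part of the Tcl entry):

import os

def hard_link_groups(paths):
    # Paths sharing a (device, inode) pair are hard links to one physical file.
    by_inode = {}
    for path in paths:
        st = os.lstat(path)               # lstat so symlinks are not followed
        by_inode.setdefault((st.st_dev, st.st_ino), []).append(path)
    return [group for group in by_inode.values() if len(group) > 1]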

Library: Tcllib (Package: fileutil)
Library: Tcllib (Package: md5)
package require fileutil
package require md5
 
proc finddupfiles {dir {minsize 1}} {
    foreach fn [fileutil::find $dir] {
        file lstat $fn stat
        if {$stat(size) < $minsize} continue
        dict lappend byino $stat(dev),$stat(ino) $fn
        if {$stat(type) ne "file"} continue
        set f [open $fn "rb"]
        set content [read $f]
        close $f
        set md5 [md5::md5 -hex $content]
        dict lappend byhash $md5 $fn
    }
    set groups {}
    foreach group [dict values $byino] {
        if {[llength $group] <= 1} continue
        set gs [lsort $group]
        dict set groups [lindex $gs 0] $gs
    }
    foreach group [dict values $byhash] {
        if {[llength $group] <= 1} continue
        foreach f $group {
            if {[dict exists $groups $f]} {
                dict set groups $f [lsort -unique \
                    [concat [dict get $groups $f] $group]]
                unset group
                break
            }
        }
        if {[info exist group]} {
            set gs [lsort $group]
            dict set groups [lindex $gs 0] $gs
        }
    }
    set masters {}
    dict for {n g} $groups {
        lappend masters [list $n [llength $g],$n]
    }
    set result {}
    foreach p [lsort -decreasing -index 1 -dictionary $masters] {
        set n [lindex $p 0]
        lappend result $n [dict get $groups $n]
    }
    return $result
}

foreach {leader dupes} [finddupfiles {*}$argv] {
    puts "$leader has duplicates"
    set n 0
    foreach d $dupes {
        if {$d ne $leader} {
            puts " dupe #[incr n]: $d"
        }
    }
}

Section of sample output:

./compat/zlib/zconf.h has duplicates
   dupe #1: ./compat/zlib/zconf.h.in
./compat/zlib/contrib/vstudio/vc10/zlib.rc has duplicates
   dupe #1: ./compat/zlib/contrib/vstudio/vc9/zlib.rc
./compat/zlib/contrib/delphi/zlibd32.mak has duplicates
   dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak

zkl

Uses the MsgHash DLL so the entire file does not have to be read into memory to hash it (the built-in MD5 only hashes one blob; MsgHash can hash a chunked blob).

I tried threading this; even though it was over twice as fast, it was not really worth it.
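
The chunked-hashing idea itself is not specific to zkl: feed the digest one buffer at a time instead of slurping the whole file. A minimal illustration using Python's hashlib (the name md5_of_file and the 100,000-byte buffer size are assumptions that merely mirror the buffer used below):

import hashlib

def md5_of_file(path, bufsize=100_000):
    # Hash the file a chunk at a time so large files never sit fully in memory.
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(bufsize), b""):
            h.update(chunk)
    return h.hexdigest()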

File findDupFiles.zkl:

include(zkl.h.zkl);
const FLAGS=FILE.GLOB.IGNORE_CASE + FILE.GLOB.NO_DIRS;
var [const] MsgHash=Import("zklMsgHash");
var recurse=False, fileSpec, minSz=0, maxSz=(0).MAX;
 
argh:=Utils.Argh(
   T("+R","R","Recurse into subdirectories, starting at <arg>",
      fcn(arg){ recurse=arg }),
   T("+minSz","","Only consider files larger than <arg>",
      fcn(arg){ minSz=arg.toInt() }),
   T("+maxSz","","Only consider files less than <arg>",
      fcn(arg){ maxSz=arg.toInt() }),
);

argh.parse(vm.arglist);
try { fileSpec=argh.loners[0]; }
catch{
   argh.usage("Find duplicate files");
   System.exit(1);
}

fnames:=Data(0,String);
if (recurse) File.globular(recurse,fileSpec,True,FLAGS,fnames);
else         File.glob(fileSpec,FLAGS).pump(fnames);

files:=Dictionary(); // (len:(name,name...), ...)
foreach fname in (fnames){
   sz:=File.len(fname);
   if(minSz<=sz<=maxSz) files.appendV(File.len(fname),fname);
}

//////////////////////// group files by size
files=files.pump(List,Void.Xplode,fcn(k,v){ v.len()>1 and v or Void.Skip });
println("Found %d groups of same sized files, %d files total.".fmt(files.len(),
   files.apply("len").sum(0)));

if(not files) System.exit(); // no files found

buffer:=Data(0d100_000); // we'll reuse this buffer for hashing
hashes:=files.pump(List,'wrap(fnames){ // get the MD5 hash for each file
   fnames.pump(List,'wrap(fname){
      file,hash := File(fname,"rb"), MsgHash.toSink("MD5");
      file.pump(buffer,hash); file.close();
      return(hash.close(),fname); // -->( (hash,name), (hash,name) ... )
   })
},T(Void.Write,Void.Write)); // flatten list of lists of lists to above

// Hash the file hashes, then scoop out the files with the same hash
buffer:=Dictionary();
files:=hashes.pump(Void,Void.Xplode,buffer.appendV)
      .pump(List,Void.Xplode,fcn(k,v){ v.len()>1 and v or Void.Skip });

println("Found %d duplicate files:".fmt(files.apply("len").sum(0)));
foreach group in (files){ println(" ",group.concat(", ")) }
Output:
$ zkl findDupFiles.zkl
Find duplicate files
Options:
  --R (-R) <arg>: Recurse into subdirectories, starting at <arg>
  --maxSz <arg>: Only consider files less than <arg>
  --minSz <arg>: Only consider files larger than <arg>

$ zkl findDupFiles.zkl '*' 
Found 16 groups of same sized files, 34 files total.
Found 8 duplicate files:
   unixdict.txt, dict.txt, uuuu.txt
   zz.zkl, zzDup.zkl
   gooperf.dat, zklTmpFile082p1V, test.dat