Find duplicate files: Difference between revisions

← Older edit

Find duplicate files (view source)

Revision as of 03:24, 24 April 2024

31,444 bytes added , 1 month ago

→‎{{header|jq}}: -R

Peak

2,458

edits

Revision as of 18:20, 9 October 2020 (view source) rosettacode>Gerard Schildberger m (added whitespace and a ;Task:.) ← Older edit		Latest revision as of 03:24, 24 April 2024 (view source) Peak (talk \| contribs) (→‎{{header\|jq}}: -R)
(13 intermediate revisions by 9 users not shown)
Line 14: For extra points, detect when whole directory sub-trees are identical, or optionally remove or link identical files. <br><br> =={{header\|C++}}== The code uses - xxhash_cpp downloaded from https://github.com/RedSpah/xxhash_cpp - boost filesystem, boost format, and boost iostreams Compiles on Windows10 and Linux. <syntaxhighlight lang="cpp"> #include<iostream> #include<string> #include<boost/filesystem.hpp> #include<boost/format.hpp> #include<boost/iostreams/device/mapped_file.hpp> #include<optional> #include<algorithm> #include<iterator> #include<execution> #include"dependencies/xxhash.hpp" // https://github.com/RedSpah/xxhash_cpp /** * Find ranges (neighbouring elements) of the same value within [begin, end[ and * call callback for each such range * @param begin start of container * @param end end of container (1 beyond last element) * @param function returns value for each iterator V(T&) @param callback void(start, end, value) * @return number of range / template<typename T, typename V, typename F> size_t for_each_adjacent_range(T begin, T end, V getvalue, F callback) { size_t partitions = 0; while (begin != end) { auto const& value = getvalue(begin); auto current = begin; while (++current != end && getvalue(current) == value); callback(begin, current, value); ++partitions; begin = current; } return partitions; } namespace bi = boost::iostreams; namespace fs = boost::filesystem; struct file_entry { public: explicit file_entry(fs::directory_entry const & entry) : path_{entry.path()}, size_{fs::file_size(entry)} {} auto size() const { return size_; } auto const& path() const { return path_; } auto get_hash() { if (!hash_) hash_ = compute_hash(); return hash_; } private: xxh::hash64_t compute_hash() { bi::mapped_file_source source; source.open<fs::wpath>(this->path()); if (!source.is_open()) { std::cerr << "Cannot open " << path() << std::endl; throw std::runtime_error("Cannot open file"); } xxh::hash_state64_t hash_stream; hash_stream.update(source.data(), 
size_); return hash_stream.digest(); } private: fs::wpath path_; uintmax_t size_; std::optional<xxh::hash64_t> hash_; }; using vector_type = std::vector<file_entry>; using iterator_type = vector_type::iterator; auto find_files_in_dir(fs::wpath const& path, vector_type& file_vector, uintmax_t min_size = 1) { size_t found = 0, ignored = 0; if (!fs::is_directory(path)) { std::cerr << path << " is not a directory!" << std::endl; } else { std::cerr << "Searching " << path << std::endl; for (auto& e : fs::recursive_directory_iterator(path)) { ++found; if (fs::is_regular_file(e) && fs::file_size(e) >= min_size) file_vector.emplace_back(e); else ++ignored; } } return std::make_tuple(found, ignored); } int main(int argn, char* argv[]) { vector_type files; for (auto i = 1; i < argn; ++i) { fs::wpath path(argv[i]); auto [found, ignored] = find_files_in_dir(path, files); std::cerr << boost::format{ " %1$6d files found\n" " %2$6d files ignored\n" " %3$6d files added\n" } % found % ignored % (found - ignored) << std::endl; } std::cerr << "Found " << files.size() << " regular files" << std::endl; // sort files in descending order by file size std::sort(std::execution::par_unseq, files.begin(), files.end() , [](auto const& a, auto const& b) { return a.size() > b.size(); } ); for_each_adjacent_range( std::begin(files) , std::end(files) , [](vector_type::value_type const& f) { return f.size(); } , [](auto start, auto end, auto file_size) { // Files with same size size_t nr_of_files = std::distance(start, end); if (nr_of_files > 1) { // sort range start-end by hash std::sort(start, end, [](auto& a, auto& b) { auto const& ha = a.get_hash(); auto const& hb = b.get_hash(); auto const& pa = a.path(); auto const& pb = b.path(); return std::tie(ha, pa) < std::tie(hb, pb); }); for_each_adjacent_range( start , end , [](vector_type::value_type& f) { return f.get_hash(); } , [file_size](auto hstart, auto hend, auto hash) { // Files with same size and same hash are assumed to be identical // 
could resort to compare files byte-by-byte now size_t hnr_of_files = std::distance(hstart, hend); if (hnr_of_files > 1) { std::cout << boost::format{ "%1$3d files with hash %3$016x and size %2$d\n" } % hnr_of_files % file_size % hash; std::for_each(hstart, hend, [hash, file_size](auto& e) { std::cout << '\t' << e.path() << '\n'; } ); } } ); } } ); return 0; } </syntaxhighlight> {{out}} <pre>$ ./duplicates /usr/include/boost/ Searching "/usr/include/boost/" 15264 files found 1160 files ignored 14104 files added Found 14104 regular files 2 files with hash 0c5e81a47dd8cd99 and size 15811 "/usr/include/boost/mpl/vector/aux_/preprocessed/no_ctps/vector50_c.hpp" "/usr/include/boost/mpl/vector/aux_/preprocessed/plain/vector50_c.hpp" 3 files with hash 0f2775c41bb647f3 and size 14766 "/usr/include/boost/mpl/aux_/preprocessed/msvc60/vector.hpp" "/usr/include/boost/mpl/aux_/preprocessed/msvc70/vector.hpp" "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/vector.hpp" 3 files with hash f9e02ecc3e38f3a3 and size 14714 "/usr/include/boost/mpl/aux_/preprocessed/msvc60/deque.hpp" "/usr/include/boost/mpl/aux_/preprocessed/msvc70/deque.hpp" "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/deque.hpp" 3 files with hash 73ed6d15fd62f8b3 and size 14620 "/usr/include/boost/mpl/aux_/preprocessed/msvc60/list.hpp" "/usr/include/boost/mpl/aux_/preprocessed/msvc70/list.hpp" "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/list.hpp" 3 files with hash 7a43c97436ae1913 and size 14547 "/usr/include/boost/mpl/aux_/preprocessed/msvc60/set.hpp" "/usr/include/boost/mpl/aux_/preprocessed/msvc70/set.hpp" "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/set.hpp" ... 
</pre> =={{header\|Elixir}}== {{trans\|Ruby}} <~~lang~~syntaxhighlight lang="elixir">defmodule Files do def find_duplicate_files(dir) do IO.puts "\nDirectory : #{dir}" Line 38 ⟶ 225: end hd(System.argv) \|> Files.find_duplicate_files</~~lang~~syntaxhighlight> {{out}} Line 62 ⟶ 249: =={{header\|Go}}== In theory this should work on any of the operating systems supported by Go (Linux, macOS, Windows, OpenBSD etc.) though only tested on Ubuntu 16.04. <~~lang~~syntaxhighlight lang="go">package main import ( Line 132 ⟶ 319: fmt.Println() } }</~~lang~~syntaxhighlight> {{out}} Line 160 ⟶ 347: - works on Windows as well as Unix Systems (tested with Mint 17 / Windows 7) </pre> <syntaxhighlight lang="haskell"> ~~<lang Haskell>~~ import Crypto.Hash.MD5 (hash) import Data.ByteString as BS (readFile, ByteString()) Line 224 ⟶ 411: printf "Something went wrong - please use ./%s <dir> <bytes>\n" name </syntaxhighlight> ~~</lang>~~ Example output: Line 264 ⟶ 451: Hard links are indicated by displaying the files on the same line separated by "=". MD5 checksums are used to detect duplicate files. <~~lang~~syntaxhighlight lang="java">import java.io.; import java.nio.; import java.nio.file.; Line 401 ⟶ 588: return Integer.compare(len1, len2); } }</~~lang~~syntaxhighlight> {{out}} Line 416 ⟶ 603: file1 = file3 file6 </pre> =={{header\|jq}}== '''Works with jq, the C implementation of jq''' '''Works with gojq, the Go implementation of jq''' '''Works with jaq, the Rust implementation of jq''' This entry illustrates how jq plays nicely with other command-line tools - in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `md5sum`, the idea being that in practice, two files will almost surely have the same contents if their md5sum values are the same. It should be noted, however, that jq can also easily be used to parse the raw output of `md5sum` by using the -R option; the modifications required for the jq program given below would all be trivial to make. 
We'll also use the `-type f` option of the `find` command as this excludes symbolic links, which we'll assume should be ignored. An appropriate invocation of these three command-line tools would be along the lines of: <pre> jc md5sum $(find . -type f ) \| jq -nc -f find-duplicate-files.jq </pre> The output will be a stream of arrays, each array listing the files with the same hash. <syntaxhighlight lang="jq">
# The following jq program assumes the input consists of a JSON array of objects having
# keys named "hash" and "filename". It is meant to be run with -n, so the array is
# read through `inputs` rather than as `.`.

# Build an object from `stream`: each item $x is filed under the key ($x|f),
# and the values ($x|g) sharing a key are collected into an array.
def dictionary(stream; f; g):
  reduce stream as $x ({}; .[($x|f)] += [$x|g]);

# Group filenames by hash and emit only the groups holding more than one file.
dictionary(inputs[]; .hash; .filename)
| to_entries[].value
| select(length > 1)
</syntaxhighlight> {{output}} <pre> ["./toplevel.txt","./tmp/toplevel.txt"] </pre> Line 423 ⟶ 651: Should work on Windows, macOS and Linux. <~~lang~~syntaxhighlight ~~Julia~~lang="julia">using Printf, Nettle function find_duplicates(path::String, minsize::Int = 0) Line 467 ⟶ 695: end main()</~~lang~~syntaxhighlight> {{out}} Line 488 ⟶ 716: </pre> =={{header\|Mathematica}}/{{header\|Wolfram Language}}== <syntaxhighlight lang="mathematica">hash="SHA256"; ~~<lang Mathematica>hash="SHA256";~~ minSize=Quantity[1,"Megabytes"]; allfiles=Once@Select[FileNames["","",∞],!Once@DirectoryQ[#]&&Once@FileSize[#]>minSize&]; data={#,Once[FileHash[#,hash,All,"HexString"]]}&/@allfiles[[;;5]]; Grid[Select[GatherBy[data,Last],Length[#]>1&][[All,All,1]]]</~~lang~~syntaxhighlight> {{out\|Sample output}} sample directory: Line 501 ⟶ 728: someFile3 bef0039c33277f743b60b0076871110b96e14de34045aafc8e764349de6043b5 directory\someFile eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe directory\someFile4 e6385b50ec8b052b141588573f680261db714babe534d8ced8a17985b14f58e9</pre> ~~</pre>~~ sample output: <pre>35 MB {someFile,directory\someFile}</pre> =={{header\|Nim}}== Our solution works on Linux and likely on any Posix system. To mark hard links, we provide the inode number of the file. 
Two paths with the same inode number are in fact two links to the same file. To make them more visible, an asterisk is used. The detection of hard links may be OS dependent and may not work on Windows. <syntaxhighlight lang="nim">
import algorithm
import os
import strformat
import strutils
import tables
import std/sha1
import times

type
  # Mapping "size" -> "list of paths".
  PathsFromSizes = Table[BiggestInt, seq[string]]
  # Mapping "hash" -> "list of paths".
  PathsFromHashes = Table[string, seq[string]]
  # Information data.
  Info = tuple[size: BiggestInt; paths: seq[string]]

#---------------------------------------------------------------------------------------------------

proc processCmdLine(): tuple[dirpath: string; minsize: Natural] =
  ## Process the command line. Extra parameters are ignored.
  ## Quits with a message if the directory is missing or invalid, or if the
  ## minimum size is not a non-negative integer.

  if paramCount() == 0:
    quit fmt"Usage: {getAppFileName().splitPath()[1]} folder minsize"

  result.dirpath = paramStr(1)
  if not result.dirpath.dirExists():
    quit fmt"Wrong directory path: {result.dirpath}"

  if paramCount() >= 2:
    try:
      let value = parseInt(paramStr(2))
      # A negative value does not fit in "Natural"; report it like any other
      # malformed size instead of failing on the assignment.
      if value < 0:
        raise newException(ValueError, "negative size")
      result.minsize = value
    except ValueError:
      quit fmt"Wrong minimum size: {paramStr(2)}"

#---------------------------------------------------------------------------------------------------

proc initPathsFromSize(dirpath: string; minsize: Natural): PathsFromSizes =
  ## Retrieve the files in directory "dirpath" with minimal size "minsize"
  ## and build the mapping from size to paths.

  for path in dirpath.walkDirRec():
    if not path.fileExists(): continue    # Not a regular file.
    let size = path.getFileSize()
    if size >= minsize:
      # Store path in "size to paths" table.
      result.mgetOrPut(size, @[]).add(path)

#---------------------------------------------------------------------------------------------------

proc initPathsFromHashes(pathsFromSizes: PathsFromSizes): PathsFromHashes =
  ## Compute hashes for files whose size is not unique and build the mapping
  ## from hash to paths.

  for paths in pathsFromSizes.values:
    if paths.len > 1:
      for path in paths:
        # Store path in "digest to paths" table.
        result.mgetOrPut($path.secureHashFile(), @[]).add(path)

#---------------------------------------------------------------------------------------------------

proc cmp(x, y: Info): int =
  ## Compare two information tuples. Used to sort the list of duplicates files.

  result = cmp(x.size, y.size)
  if result == 0:
    # Same size. Compare the first paths (we are sure that they are different).
    result = cmp(x.paths[0], y.paths[0])

#---------------------------------------------------------------------------------------------------

proc displayDuplicates(dirpath: string; pathsFromHashes: PathsFromHashes) =
  ## Display duplicates files in directory "dirpath", largest files first.
  ## Paths whose link count is not 1 are flagged with an asterisk in the
  ## "HL" column.

  echo "Files with same size and same SHA1 hash value in directory: ", dirpath
  echo ""

  # Build list of duplicates.
  var duplicates: seq[Info]
  for paths in pathsFromHashes.values:
    if paths.len > 1:
      duplicates.add((paths[0].getFileSize(), sorted(paths)))
  if duplicates.len == 0:
    echo "No files"
    return
  duplicates.sort(cmp, Descending)

  # Display duplicates.
  echo fmt"""{"Size":>10} {"Last date modified":^19} {"Inode":>8} HL File name"""
  echo repeat('=', 80)
  for (size, paths) in duplicates:
    echo ""
    for path in paths:
      let mtime = path.getLastModificationTime().format("YYYY-MM-dd HH:mm:ss")
      let info = path.getFileInfo()
      let inode = info.id.file
      # Mark hard links with an asterisk.
      let hardlink = if info.linkCount == 1: " " else: "*"
      echo fmt"{size:>10} {mtime:>23} {inode:>12} {hardlink:<5} {path.relativePath(dirpath)}"

#---------------------------------------------------------------------------------------------------

let (dirpath, minsize) = processCmdLine()
let pathsFromSizes = initPathsFromSize(dirpath, minsize)
let pathsFromHashes = initPathsFromHashes(pathsFromSizes)
dirpath.displayDuplicates(pathsFromHashes)
</syntaxhighlight> {{out}} <pre>Files with same size and same SHA1 hash value in directory: . 
Size Last date modified Inode HL File name ================================================================================ 499515 2020-12-10 22:48:06 12981503 subdir/tree.ppm 499515 2020-12-10 22:45:26 12722201 subdir/tree1.ppm 499515 2020-12-10 22:45:26 12722201 * tree.ppm 499515 2020-12-10 22:47:51 12722205 tree1.ppm 65322 2020-12-10 22:44:53 12722178 * house.jpg 65322 2020-12-10 22:44:53 12722178 * house1.jpeg 6401 2020-12-10 22:45:07 12722182 dragon.png 6401 2020-12-10 22:45:53 12722204 dragon1.png 6401 2020-12-10 22:46:21 12981502 subdir/dragon.png</pre> =={{header\|Objeck}}== Solution works on Windows, macOS and Linux. <~~lang~~syntaxhighlight lang="objeck">use System.IO.File; use System.Time; use Collection; Line 593 ⟶ 954: return "{$@name}, {$@size}, {$date_str}"; } }</~~lang~~syntaxhighlight> {{output}} Line 609 ⟶ 970: /tmp/foo/ee.obe, 20020, 3/29/2019 8:47:33 PM /tmp/foo/dd.obe, 20020, 3/29/2019 8:47:14 PM </pre> =={{header\|OCaml}}== Although this solution uses the Unix module, it only calls lstat from there, which exists in the Windows port of the Unix module and so should be portable. 
<syntaxhighlight lang="ocaml">
(* List the entries of [dir]; on failure report the error and return an
   empty array so the walk continues. *)
let readdir_or_empty dir =
  try Sys.readdir dir
  with Sys_error e ->
    prerr_endline ("Could not read dir " ^ dir ^ ": " ^ e);
    [||]

(* Recursively walk [root], calling [func path stat] for every regular file.
   [lstat] is used, so symbolic links are neither followed nor reported.
   Entries that cannot be stat'ed are reported and skipped. *)
let directory_walk root func =
  let rec aux dir =
    readdir_or_empty dir
    |> Array.iter (fun filename ->
           let path = Filename.concat dir filename in
           let open Unix in
           match lstat path with
           | exception Unix_error (err, _, _) ->
               prerr_endline
                 ("Could not stat " ^ path ^ ": " ^ error_message err)
           | stat -> (
               match stat.st_kind with
               | S_DIR -> aux path
               | S_REG -> func path stat
               | _ -> ()))
  in
  aux root

(* Keep reading from [ic] until [len] bytes have been read or end of file is
   reached; returns the offset just past the last byte stored in [buf]. *)
let rec input_retry ic buf pos len =
  let count = input ic buf pos len in
  if count = 0 || count = len then count + pos
  else input_retry ic buf (pos + count) (len - count)

(* Run [f] on [fn] opened in binary mode, closing the channel afterwards
   whether or not [f] raises. *)
let with_file_in_bin fn f =
  let fh = open_in_bin fn in
  Fun.protect ~finally:(fun () -> close_in fh) (fun () -> f fh)

(* Byte-by-byte comparison of two files, read in fixed-size chunks. *)
let is_really_same_file fn1 fn2 =
  with_file_in_bin fn1 (fun fh1 ->
      with_file_in_bin fn2 (fun fh2 ->
          let len = 2048 in
          let buf1 = Bytes.create len in
          let buf2 = Bytes.create len in
          let rec aux () =
            let read1 = input_retry fh1 buf1 0 len in
            let read2 = input_retry fh2 buf2 0 len in
            if read1 <> read2 then false
            else if read1 = 0 then true
              (* Compare only the bytes actually read: the rest of a buffer
                 obtained from Bytes.create holds arbitrary data. *)
            else if Bytes.sub buf1 0 read1 <> Bytes.sub buf2 0 read2 then false
            else aux ()
          in
          aux ()))

(* Usage: finddupes <dir> <min-size>. Reports hard links to an already seen
   file, and files of at least <min-size> bytes whose digest and contents
   match an earlier file. *)
let () =
  if Array.length Sys.argv < 3 then begin
    prerr_endline ("Usage: " ^ Sys.argv.(0) ^ " <dir> <min-size>");
    exit 2
  end;
  let tbl = Hashtbl.create 128 in
  let seen = Hashtbl.create 128 in
  let min_size = int_of_string Sys.argv.(2) in
  directory_walk Sys.argv.(1) (fun path stat ->
      try
        let identity_tuple = (stat.st_dev, stat.st_ino) in
        match Hashtbl.find_opt seen identity_tuple with
        | Some existing ->
            print_endline
              ("File " ^ existing ^ " is the same hard link as " ^ path)
        | None -> (
            Hashtbl.add seen identity_tuple path;
            let size = stat.st_size in
            if size >= min_size then
              let digest = Digest.file path in
              (* Several files may share a digest; confirm with a full
                 comparison before reporting a match. *)
              Hashtbl.find_all tbl digest
              |> List.find_opt (is_really_same_file path)
              |> function
              | Some existing ->
                  print_endline ("File " ^ existing ^ " matches " ^ path)
              | None -> Hashtbl.add tbl digest path)
      with Sys_error e ->
        prerr_endline ("Could not hash " ^ path ^ ": " ^ e))
</syntaxhighlight> {{output}} <pre>$ dune build @fmt $ dune exec ./finddupes.exe . 
1024 File ./finddupes.ml matches ./_build/default/finddupes.ml File ./finddupes.ml matches ./_build/default/.formatted/finddupes.ml </pre> =={{header\|Perl}}== For supplied directory, compare all files, recursing into sub-directories. By default, showing duplicate files of 1 byte or larger, configurable with command-line option. Using CPAN <code>File</code> modules for enhanced portability. <~~lang~~syntaxhighlight lang="perl">use File::Find qw(find); use File::Compare qw(compare); use Sort::Naturally; Line 651 ⟶ 1,090: } print for find_dups(@ARGV);</~~lang~~syntaxhighlight> {{out}} <pre> 372 aaa.txt, dir2/aaa.txt Line 658 ⟶ 1,097: =={{header\|Phix}}== Works on Windows and Linux. No handling of hard (or soft) links. <!--<syntaxhighlight lang="phix">(notonline)--> ~~<lang Phix>integer min_size=1~~ <span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- file i/o</span> ~~sequence res = {}~~ <span style="color: #004080;">integer</span> <span style="color: #000000;">min_size</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> ~~atom t1 = time()+1~~ <span style="color: #004080;">sequence</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #004080;">atom</span> <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span> ~~function store_res(string filepath, sequence dir_entry)~~ ~~if not match("backup",filepath) -- (example filter)~~ <span style="color: #008080;">function</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> 
<span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">)</span> ~~and not find('d', dir_entry[D_ATTRIBUTES]) then~~ <span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"backup"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- (example filter)</span> ~~atom size = dir_entry[D_SIZE]~~ <span style="color: #008080;">and</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span> ~~if size>=min_size then~~ <span style="color: #004080;">atom</span> <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]</span> ~~res = append(res,{size,filepath,dir_entry})~~ <span style="color: #008080;">if</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">min_size</span> <span style="color: #008080;">then</span> ~~if time()>t1 then~~ <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">size</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span 
style="color: #0000FF;">,</span><span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">})</span> ~~printf(1,"%d files found\r",length(res))~~ <span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span> ~~t1 = time()+1~~ <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\r"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span> ~~end if~~ <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span> ~~end if~~ <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~end if~~ <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~return 0 -- keep going~~ <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~end function~~ <span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- keep going</span> ~~integer exit_code = walk_dir("demo\\clocks\\love", routine_id("store_res"), true)~~ <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">exit_code</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">walk_dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"demo\\clocks\\love"</span><span style="color: #0000FF;">,</span> <span style="color: 
#000000;">store_res</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">true</span><span style="color: #0000FF;">)</span> ~~res = sort(res,DESCENDING)~~ ~~printf(1,"%d files found\n",length(res))~~ <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #000000;">DESCENDING</span><span style="color: #0000FF;">)</span> <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span> ~~integer duplicates = 0~~ ~~for i=1 to length(res)-1 do~~ <span style="color: #004080;">integer</span> <span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> ~~for j=i+1 to length(res) do~~ <span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> ~~if res[i][1]!=res[j][1] then exit end if~~ <span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: 
#7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span> ~~string si = join_path({res[i][2],res[i][3][D_NAME]}),~~ <span style="color: #008080;">if</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~sj = join_path({res[j][2],res[j][3][D_NAME]})~~ <span style="color: #004080;">string</span> <span style="color: #000000;">si</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]}),</span> ~~integer fni = open(si,"rb"),~~ <span style="color: #000000;">sj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span 
style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]})</span> ~~fnj = open(sj,"rb"),~~ <span style="color: #004080;">integer</span> <span style="color: #000000;">fni</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">si</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span> ~~size = res[i][1]~~ <span style="color: #000000;">fnj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sj</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span> ~~bool same = true~~ <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> ~~if fni=-1 or fnj=-1 then ?9/0 end if~~ <span style="color: #004080;">bool</span> <span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">true</span> ~~for k=1 to size+1 do -- (check eof as well)~~ <span style="color: #008080;">if</span> <span style="color: #000000;">fni</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">or</span> <span style="color: 
#000000;">fnj</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~if getc(fni)!=getc(fnj) then~~ <span style="color: #008080;">for</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> <span style="color: #000080;font-style:italic;">-- (check eof as well)</span> ~~same = false~~ <span style="color: #008080;">if</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)!=</span><span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> ~~exit~~ <span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span> ~~end if~~ <span style="color: #008080;">exit</span> ~~end for~~ <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~close(fni)~~ <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> ~~close(fnj)~~ <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)</span> ~~if same then~~ <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span> ~~-- prettifying 
the output left as an exercise...~~ <span style="color: #008080;">if</span> <span style="color: #000000;">same</span> <span style="color: #008080;">then</span> ~~?res[i]~~ <span style="color: #000080;font-style:italic;">-- prettifying the output left as an exercise...</span> ~~?res[j]~~ <span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> ~~duplicates += 1~~ <span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]</span> ~~end if~~ <span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> ~~end for~~ <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~if time()>t1 then~~ <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> ~~printf(1,"processing %d/%d...\r",{i,length(res)})~~ <span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span> ~~t1 = time()+1~~ <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"processing %d/%d...\r"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span> ~~end if~~ <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: 
#000000;">1</span> ~~end for~~ <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> ~~printf(1,"%d duplicates found\n",duplicates)</lang>~~ <span style="color: #008080;">end</span> <span style="color: #008080;">for</span> <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d duplicates found\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">duplicates</span><span style="color: #0000FF;">)</span> <!--</syntaxhighlight>--> {{out}} <pre> Line 737 ⟶ 1,179: =={{header\|PicoLisp}}== File duplicates in /bin dir on Void Linux. Hash provided by xxhash library via mmap. <~~lang~~syntaxhighlight ~~PicoLisp~~lang="picolisp">`(== 64 64) (de mmap (L F) (native "@" "mmap" 'N 0 L 1 2 F 0) ) Line 770 ⟶ 1,212: group (cadr Lst) ) (and (filter cdr L) (println (car Lst) @)) ) )</~~lang~~syntaxhighlight> {{out}} <pre> Line 792 ⟶ 1,234: =={{header\|Python}}== <~~lang~~syntaxhighlight lang="python">from __future__ import print_function import os import hashlib Line 843 ⟶ 1,285: FindDuplicateFiles('/home/tim/Dropbox', 10241024) </syntaxhighlight> ~~</lang>~~ =={{header\|Racket}}== <~~lang~~syntaxhighlight lang="racket"> #lang racket Line 886 ⟶ 1,328: (show-duplicates (find-system-path 'home-dir) 1024) </syntaxhighlight> ~~</lang>~~ =={{header\|Raku}}== (formerly Perl 6) ~~{{works with\|Rakudo\|2017.06}}~~ This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same. Uses the very fast but cryptographically poor xxHash library to hash the files. 
This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same. ~~<lang perl6>use Digest::xxHash;~~ <syntaxhighlight lang="raku" line>use Digest::SHA256::Native; sub MAIN( $dir = '.', :$minsize = 5, :$recurse = True ) { Line 910 ⟶ 1,352: for %files.sort( +.key ).grep( .value.elems > 1)».kv -> ($size, @list) { my %dups; @list.map: { %dups{ ~~xxHash~~sha256-hex( ($_.slurp :bin).decode ) }.push: $_.Str }; for %dups.grep( .value.elems > 1)».value -> @dups { say sprintf("%9s : ", scale $size ), @dups.join(', '); Line 924 ⟶ 1,366: default { ($bytes / 2*30).round(.1) ~ ' GB' } } }</~~lang~~syntaxhighlight> {{out\|Sample output}} Passing in command line switches: --minsize=0 --recurse=False /home/me/p6 Line 938 ⟶ 1,380: <br>Note that the   '''tFID'''   (temp)   file is hard coded to the   '''C:'''   drive. <br>Only minimal error checking is performed. <~~lang~~syntaxhighlight lang="rexx">/REXX program to reads a (DOS) directory and finds and displays files that identical./ sep=center(' files are identical in size and content: ',79,"═") /define the header. / tFID= 'c:\TEMP\FINDDUP.TMP' /use this as a temporary FileID. / Line 963 ⟶ 1,405: if lines(tFID)\==0 then 'ERASE' tFID /do housecleaning (delete temp file)./ /stick a fork in it, we're all done. 
/</~~lang~~syntaxhighlight> '''output'''   when using (checking) with the default root directory: <pre> Line 987 ⟶ 1,429: ::   uses variables for some command names and command options ::*   shows the number of files examined and also the directory name <~~lang~~syntaxhighlight lang="rexx">/REXX program to reads a (DOS) directory and finds and displays files that identical./ sep=center(' files are identical in size and content: ',79,"═") /define the header. / parse arg !; if !all(arg()) then exit /boilerplate HELP(?)/ Line 1,043 ⟶ 1,485: novalue: syntax: call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl) p: return word(arg(1),1) s: if arg(1)==1 then return arg(3); return word(arg(2) 's',1)</~~lang~~syntaxhighlight> '''output'''   when using the DIR (folder):   H:\#\REX <pre> Line 1,066 ⟶ 1,508: =={{header\|Ring}}== <~~lang~~syntaxhighlight lang="ring"> # Project : Find duplicate files Line 1,111 ⟶ 1,553: next return alist </syntaxhighlight> ~~</lang>~~ Output: <pre> Line 1,144 ⟶ 1,586: =={{header\|Ruby}}== It confirms once by the file size. When the same, it confirms a digest (md5). <~~lang~~syntaxhighlight lang="ruby">require 'digest/md5' def find_duplicate_files(dir) Line 1,161 ⟶ 1,603: end find_duplicate_files("/Windows/System32")</~~lang~~syntaxhighlight> Sample Output: Line 1,183 ⟶ 1,625: =={{header\|Rust}}== <~~lang~~syntaxhighlight ~~Rust~~lang="rust">use std::{ collections::BTreeMap, fs::{read_dir, File}, Line 1,279 ⟶ 1,721: Ok(()) }</~~lang~~syntaxhighlight> =={{header\|Sidef}}== It uses the portable ''File::Find'' module which means that it should work, virtually, on any platform. <~~lang~~syntaxhighlight lang="ruby"># usage: sidef fdf.sf [size] [dir1] [...] 
require('File::Find') Line 1,331 ⟶ 1,773: say "#{files.sort.join(%Q[\n])}\n#{'-'*80}" } }</~~lang~~syntaxhighlight> Section of sample output: <pre>% sidef fdf.sf 0 /tmp /usr/bin Line 1,358 ⟶ 1,800: {{tcllib\|fileutil}} {{tcllib\|md5}} <~~lang~~syntaxhighlight lang="tcl">package require fileutil package require md5 Line 1,414 ⟶ 1,856: } } }</~~lang~~syntaxhighlight> Section of sample output: <pre> Line 1,423 ⟶ 1,865: ./compat/zlib/contrib/delphi/zlibd32.mak has duplicates dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak </pre> =={{header\|Wren}}== {{libheader\|Wren-crypto}} {{libheader\|Wren-sort}} <syntaxhighlight lang="wren">import "io" for Directory, File, Stat import "./crypto" for Sha1 import "./sort" for Sort var findDuplicates = Fn.new { \|dir, minSize\| if (!Directory.exists(dir)) Fiber.abort("Directory does not exist.") var files = Directory.list(dir).where { \|f\| Stat.path("%(dir)/%(f)").size >= minSize } var hashMap = {} for (file in files) { var path = "%(dir)/%(file)" if (Stat.path(path).isDirectory) continue var contents = File.read(path) var hash = Sha1.digest(contents) var exists = hashMap.containsKey(hash) if (exists) { hashMap[hash].add(file) } else { hashMap[hash] = [file] } } var duplicates = [] for (key in hashMap.keys) { if (hashMap[key].count > 1) { var files = hashMap[key] var path = "%(dir)/%(files[0])" var size = Stat.path(path).size duplicates.add([size, files]) } } var cmp = Fn.new { \|i, j\| (j[0] - i[0]).sign } // by decreasing size Sort.insertion(duplicates, cmp) System.print("The sets of duplicate files are:\n") for (dup in duplicates) { System.print("Size %(dup[0]) bytes:") System.print(dup[1].join("\n")) System.print() } } findDuplicates.call("./", 1000)</syntaxhighlight> {{out}} Sample output: <pre> The sets of duplicate files are: Size 57221 bytes: big.wren big2.wren Size 16696 bytes: cls clsc Size 4096 bytes: data.blk data2.blk Size 1415 bytes: circular.wren circular2.wren </pre> Line 1,431 ⟶ 1,939: File findDupFiles.zkl: 
<~~lang~~syntaxhighlight lang="zkl">include(zkl.h.zkl); const FLAGS=FILE.GLOB.IGNORE_CASE + FILE.GLOB.NO_DIRS; var [const] MsgHash=Import("zklMsgHash"); Line 1,484 ⟶ 1,992: println("Found %d duplicate files:".fmt(files.apply("len").sum(0))); foreach group in (files){ println(" ",group.concat(", ")) }</~~lang~~syntaxhighlight> {{out}} <pre>