Find duplicate files: Difference between revisions

m (added whitespace and a ;Task:.)
 
(13 intermediate revisions by 9 users not shown)
Line 14:
For extra points, detect when whole directory sub-trees are identical, or optionally remove or link identical files.
<br><br>
 
=={{header|C++}}==
The code uses
 
- xxhash_cpp downloaded from https://github.com/RedSpah/xxhash_cpp
 
- boost filesystem, boost format, and boost iostreams
 
Compiles on Windows10 and Linux.
<syntaxhighlight lang="cpp">
#include<iostream>
#include<string>
#include<boost/filesystem.hpp>
#include<boost/format.hpp>
#include<boost/iostreams/device/mapped_file.hpp>
#include<optional>
#include<algorithm>
#include<iterator>
#include<execution>
#include"dependencies/xxhash.hpp" // https://github.com/RedSpah/xxhash_cpp
 
/**
 * Walk [begin, end) and split it into maximal runs of neighbouring elements
 * whose extracted value compares equal, invoking a callback once per run.
 *
 * @param begin    iterator to the first element
 * @param end      iterator one past the last element
 * @param getvalue callable applied to a dereferenced iterator; its result is
 *                 the value the elements of a run must share
 * @param callback callable invoked as callback(run_begin, run_end, value)
 *                 for every run, in order
 * @return number of runs found (0 for an empty range)
 */
template<typename T, typename V, typename F>
size_t for_each_adjacent_range(T begin, T end, V getvalue, F callback) {
    size_t run_count = 0;
    for (T run_begin = begin; run_begin != end; ++run_count) {
        // The value of the first element defines the run.
        auto const& run_value = getvalue(*run_begin);
        T run_end = run_begin;
        ++run_end;
        // Extend the run while the following elements yield the same value.
        while (run_end != end && getvalue(*run_end) == run_value) {
            ++run_end;
        }
        callback(run_begin, run_end, run_value);
        run_begin = run_end;
    }
    return run_count;
}
 
namespace bi = boost::iostreams;
namespace fs = boost::filesystem;
 
/**
 * A file found during the directory scan: its path, the size it had when it
 * was scanned, and a content hash that is computed on first request and then
 * cached.
 */
struct file_entry {
public:
    /**
     * Record the path and the current size of a directory entry.
     * @param entry directory entry of the file; fs::file_size(entry) is
     *              queried here
     */
    explicit file_entry(fs::directory_entry const & entry)
        : path_{entry.path()}, size_{fs::file_size(entry)}
    {}

    /// @return the size recorded at construction time
    auto size() const { return size_; }

    /// @return the path of the file
    auto const& path() const { return path_; }

    /**
     * @return the 64-bit xxhash of the file content; computed on the first
     *         call and served from the cache afterwards
     * @throws std::runtime_error if the file cannot be opened
     */
    auto get_hash() {
        if (!hash_)
            hash_ = compute_hash();
        return *hash_;
    }
private:
    /**
     * Memory-map the file and hash the mapped bytes.
     * @throws std::runtime_error if the mapping is not open after open()
     */
    xxh::hash64_t compute_hash() {
        bi::mapped_file_source source;
        source.open<fs::wpath>(this->path());
        if (!source.is_open()) {
            std::cerr << "Cannot open " << path() << std::endl;
            throw std::runtime_error("Cannot open file");
        }
        xxh::hash_state64_t hash_stream;
        // Hash the length of the mapping, not the size cached at scan time:
        // if the file shrank in between, reading size_ bytes would run past
        // the end of the mapped region.
        hash_stream.update(source.data(), source.size());
        return hash_stream.digest();
    }
private:
    fs::wpath path_;                       // location of the file
    uintmax_t size_;                       // size recorded by the constructor
    std::optional<xxh::hash64_t> hash_;    // empty until get_hash() is first called
};
 
using vector_type = std::vector<file_entry>;
using iterator_type = vector_type::iterator;
 
/**
 * Recursively scan a directory and append every regular file of at least
 * min_size bytes to file_vector. Progress and errors are written to std::cerr.
 *
 * @param path        directory to scan; if it is not a directory a message is
 *                    printed and nothing is added
 * @param file_vector receives one file_entry per accepted file
 * @param min_size    smallest file size (in bytes) that is accepted
 * @return tuple (entries visited, entries not added)
 */
auto find_files_in_dir(fs::wpath const& path, vector_type& file_vector, uintmax_t min_size = 1) {
    size_t visited = 0;
    size_t skipped = 0;

    if (!fs::is_directory(path)) {
        std::cerr << path << " is not a directory!" << std::endl;
        return std::make_tuple(visited, skipped);
    }

    std::cerr << "Searching " << path << std::endl;

    for (auto& entry : fs::recursive_directory_iterator(path)) {
        ++visited;
        // Anything that is not a regular file, or is too small, is only counted.
        if (!(fs::is_regular_file(entry) && fs::file_size(entry) >= min_size)) {
            ++skipped;
            continue;
        }
        file_vector.emplace_back(entry);
    }
    return std::make_tuple(visited, skipped);
}
 
/**
 * Scan every directory named on the command line, then report groups of files
 * that have both the same size and the same hash.
 *
 * Scan statistics go to std::cerr, the duplicate report goes to std::cout.
 * Files are first sorted by size (descending); only runs of equal size are
 * hashed, and within such a run files are sorted by (hash, path) so that equal
 * hashes become neighbours.
 */
int main(int argn, char* argv[])
{
    vector_type files;
    for (int arg = 1; arg < argn; ++arg) {
        fs::wpath path(argv[arg]);
        auto [found, ignored] = find_files_in_dir(path, files);
        std::cerr << boost::format{
            " %1$6d files found\n"
            " %2$6d files ignored\n"
            " %3$6d files added\n" } % found % ignored % (found - ignored)
            << std::endl;
    }

    std::cerr << "Found " << files.size() << " regular files" << std::endl;

    // Largest files first, so files of equal size end up next to each other.
    auto const larger_first = [](auto const& lhs, auto const& rhs) {
        return rhs.size() < lhs.size();
    };
    std::sort(std::execution::par_unseq, files.begin(), files.end(), larger_first);

    // Orders by hash, ties broken by path.
    auto const by_hash_then_path = [](auto& lhs, auto& rhs) {
        auto const lhs_hash = lhs.get_hash();
        auto const rhs_hash = rhs.get_hash();
        if (lhs_hash != rhs_hash) {
            return lhs_hash < rhs_hash;
        }
        return lhs.path() < rhs.path();
    };

    auto const size_of = [](vector_type::value_type const& f) { return f.size(); };
    auto const hash_of = [](vector_type::value_type& f) { return f.get_hash(); };

    auto const report_same_size = [&](auto start, auto end, auto file_size) {
        // A size that occurs only once cannot belong to a duplicate.
        size_t const same_size_count = std::distance(start, end);
        if (same_size_count <= 1) {
            return;
        }
        std::sort(start, end, by_hash_then_path);

        auto const report_same_hash = [file_size](auto hstart, auto hend, auto hash) {
            // Files with same size and same hash are assumed to be identical;
            // a byte-by-byte comparison could be added here.
            size_t const same_hash_count = std::distance(hstart, hend);
            if (same_hash_count <= 1) {
                return;
            }
            std::cout << boost::format{ "%1$3d files with hash %3$016x and size %2$d\n" }
                % same_hash_count % file_size % hash;
            for (auto it = hstart; it != hend; ++it) {
                std::cout << '\t' << it->path() << '\n';
            }
        };
        for_each_adjacent_range(start, end, hash_of, report_same_hash);
    };

    for_each_adjacent_range(std::begin(files), std::end(files), size_of, report_same_size);
    return 0;
}
 
</syntaxhighlight>
{{out}}
<pre>$ ./duplicates /usr/include/boost/
Searching "/usr/include/boost/"
15264 files found
1160 files ignored
14104 files added
 
Found 14104 regular files
2 files with hash 0c5e81a47dd8cd99 and size 15811
"/usr/include/boost/mpl/vector/aux_/preprocessed/no_ctps/vector50_c.hpp"
"/usr/include/boost/mpl/vector/aux_/preprocessed/plain/vector50_c.hpp"
3 files with hash 0f2775c41bb647f3 and size 14766
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/vector.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/vector.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/vector.hpp"
3 files with hash f9e02ecc3e38f3a3 and size 14714
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/deque.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/deque.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/deque.hpp"
3 files with hash 73ed6d15fd62f8b3 and size 14620
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/list.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/list.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/list.hpp"
3 files with hash 7a43c97436ae1913 and size 14547
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/set.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/set.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/set.hpp"
...
</pre>
 
=={{header|Elixir}}==
{{trans|Ruby}}
<langsyntaxhighlight lang="elixir">defmodule Files do
def find_duplicate_files(dir) do
IO.puts "\nDirectory : #{dir}"
Line 38 ⟶ 225:
end
 
hd(System.argv) |> Files.find_duplicate_files</langsyntaxhighlight>
 
{{out}}
Line 62 ⟶ 249:
=={{header|Go}}==
In theory this should work on any of the operating systems supported by Go (Linux, macOS, Windows, OpenBSD etc.) though only tested on Ubuntu 16.04.
<langsyntaxhighlight lang="go">package main
 
import (
Line 132 ⟶ 319:
fmt.Println()
}
}</langsyntaxhighlight>
 
{{out}}
Line 160 ⟶ 347:
- works on Windows as well as Unix Systems (tested with Mint 17 / Windows 7)
</pre>
<syntaxhighlight lang="haskell">
<lang Haskell>
import Crypto.Hash.MD5 (hash)
import Data.ByteString as BS (readFile, ByteString())
Line 224 ⟶ 411:
printf "Something went wrong - please use ./%s <dir> <bytes>\n" name
 
</syntaxhighlight>
</lang>
 
Example output:
Line 264 ⟶ 451:
Hard links are indicated by displaying the files on the same line separated by "=".
MD5 checksums are used to detect duplicate files.
<langsyntaxhighlight lang="java">import java.io.*;
import java.nio.*;
import java.nio.file.*;
Line 401 ⟶ 588:
return Integer.compare(len1, len2);
}
}</langsyntaxhighlight>
 
{{out}}
Line 416 ⟶ 603:
file1 = file3
file6
</pre>
 
=={{header|jq}}==
'''Works with jq, the C implementation of jq'''
 
'''Works with gojq, the Go implementation of jq'''
 
'''Works with jaq, the Rust implementation of jq'''
 
This entry illustrates how jq plays nicely with other command-line
tools -
in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `md5sum`,
the idea being that in practice, two files will almost surely have the same
contents if their md5sum values are the same. It should be noted, however,
that jq can also easily be used to parse the raw output of `md5sum` by using the -R option;
the modifications required for the jq program given below would all be trivial to make.
We'll also use the `-type f` option of the `find` command as this excludes symbolic links,
which we'll assume should be ignored.
 
An appropriate invocation of these three command-line tools would be along the lines of:
 
<pre>
jc md5sum $(find . -type f ) | jq -nc -f find-duplicate-files.jq
</pre>
 
The output will be a stream of arrays, each array listing the files with the same hash.
<syntaxhighlight lang="jq">
# Expected input: JSON arrays of objects, each object having the keys "hash"
# and "filename". The arrays are consumed with `inputs`, so run jq with -n.

# Fold `stream` into an object: every item $entry is filed under the key
# ($entry|f), and ($entry|g) is appended to the array stored under that key.
def dictionary(stream; f; g):
  reduce stream as $entry ({};
    .[($entry | f)] += [$entry | g]);

# Group the filenames by hash, then emit each group with two or more members.
dictionary(inputs[]; .hash; .filename)
| to_entries[]
| .value
| select(length >= 2)
</syntaxhighlight>
{{output}}
<pre>
["./toplevel.txt","./tmp/toplevel.txt"]
</pre>
 
Line 423 ⟶ 651:
Should work on Windows, macOS and Linux.
 
<langsyntaxhighlight Julialang="julia">using Printf, Nettle
 
function find_duplicates(path::String, minsize::Int = 0)
Line 467 ⟶ 695:
end
 
main()</langsyntaxhighlight>
 
{{out}}
Line 488 ⟶ 716:
</pre>
 
=={{header|Mathematica}}/{{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">hash="SHA256";
 
<lang Mathematica>hash="SHA256";
minSize=Quantity[1,"Megabytes"];
allfiles=Once@Select[FileNames["*","",∞],!Once@DirectoryQ[#]&&Once@FileSize[#]>minSize&];
data={#,Once[FileHash[#,hash,All,"HexString"]]}&/@allfiles[[;;5]];
Grid[Select[GatherBy[data,Last],Length[#]>1&][[All,All,1]]]</langsyntaxhighlight>
{{out|Sample output}}
sample directory:
Line 501 ⟶ 728:
someFile3 bef0039c33277f743b60b0076871110b96e14de34045aafc8e764349de6043b5
directory\someFile eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe
directory\someFile4 e6385b50ec8b052b141588573f680261db714babe534d8ced8a17985b14f58e9</pre>
</pre>
sample output:
<pre>35 MB {someFile,directory\someFile}</pre>
 
=={{header|Nim}}==
Our solution works on Linux and likely on any Posix system. To mark hard links, we provide the inode number of the file. Two paths with the same inode number are in fact two links to the same file. To make them more visible, an asterisk is used.
 
The detection of hard links may be OS dependent and may not work on Windows.
 
<syntaxhighlight lang="nim">import algorithm
import os
import strformat
import strutils
import tables
import std/sha1
import times
 
type

# Mapping "size" -> "list of paths": every path that has a given file size.
PathsFromSizes = Table[BiggestInt, seq[string]]

# Mapping "hash" -> "list of paths": every path whose content has a given hash
# (the hash is stored in its string form).
PathsFromHashes = Table[string, seq[string]]

# One group of duplicates: the common file size and the paths in the group.
Info = tuple[size: BiggestInt; paths: seq[string]]
 
 
#---------------------------------------------------------------------------------------------------
 
proc processCmdLine(): tuple[dirpath: string; minsize: Natural] =
  ## Read the command-line arguments: a directory path, optionally followed by
  ## a minimum size. Any further arguments are ignored.
  ## The program quits with a message when no argument is given, when the
  ## directory does not exist, or when the size is not a valid integer.

  if paramCount() < 1:
    quit fmt"Usage: {getAppFileName().splitPath()[1]} folder minsize"

  result.dirpath = paramStr(1)
  if not dirExists(result.dirpath):
    quit fmt"Wrong directory path: {result.dirpath}"

  # The minimum size is optional; without it "minsize" keeps its default value.
  if paramCount() < 2:
    return
  try:
    result.minsize = parseInt(paramStr(2))
  except ValueError:
    quit fmt"Wrong minimum size: {paramStr(2)}"
 
#---------------------------------------------------------------------------------------------------
 
proc initPathsFromSize(dirpath: string; minsize: Natural): PathsFromSizes =
  ## Walk directory "dirpath" recursively and group the paths of all files
  ## of at least "minsize" bytes by their size.

  for candidate in walkDirRec(dirpath):
    # Only paths for which "fileExists" holds are considered.
    if candidate.fileExists():
      let nbytes = getFileSize(candidate)
      if nbytes >= minsize:
        # Append the path to the list kept for this size.
        result.mgetOrPut(nbytes, @[]).add(candidate)
 
#---------------------------------------------------------------------------------------------------
 
proc initPathsFromHashes(pathsFromSizes: PathsFromSizes): PathsFromHashes =
  ## Hash every file whose size is shared with at least one other file and
  ## group the paths by the string form of that hash.

  for sameSizePaths in pathsFromSizes.values:
    # A size that occurs once cannot belong to a duplicate: no need to hash.
    if sameSizePaths.len < 2:
      continue
    for path in sameSizePaths:
      let digest = $secureHashFile(path)
      result.mgetOrPut(digest, @[]).add(path)
 
#---------------------------------------------------------------------------------------------------
 
proc cmp(x, y: Info): int =
  ## Ordering used to sort the list of duplicate groups: by size first,
  ## then by the first path of each group.

  let sizeOrder = cmp(x.size, y.size)
  result =
    if sizeOrder != 0: sizeOrder
    else: cmp(x.paths[0], y.paths[0])  # Same size: fall back to the first paths.
 
#---------------------------------------------------------------------------------------------------
 
proc displayDuplicates(dirpath: string; pathsFromHashes: PathsFromHashes) =
  ## Print the groups of duplicate files found in directory "dirpath".
  ## Groups are listed in descending order; for each path the size, the last
  ## modification time, the inode number, a hard-link marker and the path
  ## relative to "dirpath" are shown.

  echo "Files with same size and same SHA1 hash value in directory: ", dirpath
  echo ""

  # Keep only the hashes that are shared by two or more paths.
  var duplicates: seq[Info]
  for group in pathsFromHashes.values:
    if group.len < 2:
      continue
    duplicates.add((getFileSize(group[0]), group.sorted()))

  if duplicates.len == 0:
    echo "No files"
    return
  duplicates.sort(cmp, Descending)

  # Header, then one blank-line-separated section per group.
  echo fmt"""{"Size":>10} {"Last date modified":^19} {"Inode":>8} HL File name"""
  echo repeat('=', 80)
  for entry in duplicates:
    let size = entry.size
    echo ""
    for path in entry.paths:
      let mtime = format(getLastModificationTime(path), "YYYY-MM-dd HH:mm:ss")
      let info = getFileInfo(path)
      let inode = info.id.file
      # A link count other than 1 is flagged with an asterisk.
      let hardlink = if info.linkCount != 1: "*" else: " "
      echo fmt"{size:>10} {mtime:>23} {inode:>12} {hardlink:<5} {path.relativePath(dirpath)}"
 
 
#———————————————————————————————————————————————————————————————————————————————————————————————————
 
# Program entry point: read the arguments, group the files by size, hash the
# files whose size is not unique, then display the duplicates.
let (dirpath, minsize) = processCmdLine()
let pathsFromSizes = initPathsFromSize(dirpath, minsize)
let pathsFromHashes = initPathsFromHashes(pathsFromSizes)
dirpath.displayDuplicates(pathsFromHashes)</syntaxhighlight>
 
{{out}}
<pre>Files with same size and same SHA1 hash value in directory: .
 
Size Last date modified Inode HL File name
================================================================================
 
499515 2020-12-10 22:48:06 12981503 subdir/tree.ppm
499515 2020-12-10 22:45:26 12722201 * subdir/tree1.ppm
499515 2020-12-10 22:45:26 12722201 * tree.ppm
499515 2020-12-10 22:47:51 12722205 tree1.ppm
 
65322 2020-12-10 22:44:53 12722178 * house.jpg
65322 2020-12-10 22:44:53 12722178 * house1.jpeg
 
6401 2020-12-10 22:45:07 12722182 dragon.png
6401 2020-12-10 22:45:53 12722204 dragon1.png
6401 2020-12-10 22:46:21 12981502 subdir/dragon.png</pre>
 
=={{header|Objeck}}==
Solution works on Windows, macOS and Linux.
<langsyntaxhighlight lang="objeck">use System.IO.File;
use System.Time;
use Collection;
Line 593 ⟶ 954:
return "{$@name}, {$@size}, {$date_str}";
}
}</langsyntaxhighlight>
 
{{output}}
Line 609 ⟶ 970:
/tmp/foo/ee.obe, 20020, 3/29/2019 8:47:33 PM
/tmp/foo/dd.obe, 20020, 3/29/2019 8:47:14 PM
</pre>
 
=={{header|OCaml}}==
Although this solution uses the Unix module, it only calls lstat from there, which exists in the Windows port of the Unix module and so should be portable.
<syntaxhighlight lang="ocaml">(* List the entries of [dir]. If the directory cannot be read, report the
   error on stderr and return an empty array. *)
let readdir_or_empty dir =
  match Sys.readdir dir with
  | entries -> entries
  | exception Sys_error msg ->
      prerr_endline ("Could not read dir " ^ dir ^ ": " ^ msg);
      [||]
 
(* Walk the tree rooted at [root] and call [func path stat] for every regular
   file, where [stat] is the result of [Unix.lstat path]. Directories are
   descended into; every other kind of entry is skipped. *)
let directory_walk root func =
  let rec visit dir =
    let entries = readdir_or_empty dir in
    Array.iter
      (fun name ->
        let path = Filename.concat dir name in
        let info = Unix.lstat path in
        match info.Unix.st_kind with
        | Unix.S_DIR -> visit path
        | Unix.S_REG -> func path info
        | _ -> ())
      entries
  in
  visit root
 
(* Keep calling [input] until [len] bytes have been stored in [buf] starting
   at [pos], or until [input] returns 0. The result is the position just past
   the last byte stored. *)
let rec input_retry ic buf pos len =
  match input ic buf pos len with
  | 0 -> pos
  | got when got = len -> pos + got
  | got -> input_retry ic buf (pos + got) (len - got)
 
(* Open [fn] in binary mode, apply [f] to the channel and close the channel
   afterwards, whether [f] returns or raises. *)
let with_file_in_bin fn f =
  let ic = open_in_bin fn in
  let release () = close_in ic in
  Fun.protect ~finally:release (fun () -> f ic)
 
(* Compare the contents of files [fn1] and [fn2] chunk by chunk.
   Returns [true] when both files hold exactly the same bytes. *)
let is_really_same_file fn1 fn2 =
  with_file_in_bin fn1 (fun fh1 ->
      with_file_in_bin fn2 (fun fh2 ->
          let len = 2048 in
          let buf1 = Bytes.create len in
          let buf2 = Bytes.create len in
          let rec aux () =
            let read1 = input_retry fh1 buf1 0 len in
            let read2 = input_retry fh2 buf2 0 len in
            if read1 <> read2 then false
            else if read1 = 0 then true
            (* Compare only the bytes actually read: [Bytes.create] leaves the
               buffers uninitialized, so the part beyond [read1] may hold
               arbitrary data that differs between the two buffers. *)
            else if Bytes.sub buf1 0 read1 <> Bytes.sub buf2 0 read2 then false
            else aux ()
          in
          aux ()))
 
(* Entry point: argument 1 is the root directory, argument 2 the minimum file
   size. Every regular file is reported either as a hard link of a file seen
   earlier (same device and inode), or as a match of an earlier file with the
   same digest and the same contents. *)
let () =
  let by_digest = Hashtbl.create 128 in
  let by_identity = Hashtbl.create 128 in
  let min_size = int_of_string Sys.argv.(2) in
  let examine path (stat : Unix.stats) =
    let identity = (stat.Unix.st_dev, stat.Unix.st_ino) in
    match Hashtbl.find_opt by_identity identity with
    | Some earlier ->
        print_endline
          ("File " ^ earlier ^ " is the same hard link as " ^ path)
    | None ->
        Hashtbl.add by_identity identity path;
        (* Files below the minimum size are neither hashed nor recorded. *)
        if stat.Unix.st_size >= min_size then begin
          let digest = Digest.file path in
          let candidates = Hashtbl.find_all by_digest digest in
          match List.find_opt (is_really_same_file path) candidates with
          | Some earlier ->
              print_endline ("File " ^ earlier ^ " matches " ^ path)
          | None -> Hashtbl.add by_digest digest path
        end
  in
  directory_walk Sys.argv.(1) (fun path stat ->
      try examine path stat
      with Sys_error e -> prerr_endline ("Could not hash " ^ path ^ ": " ^ e))
</syntaxhighlight>
 
{{output}}
<pre>$ dune build @fmt
$ dune exec ./finddupes.exe . 1024
File ./finddupes.ml matches ./_build/default/finddupes.ml
File ./finddupes.ml matches ./_build/default/.formatted/finddupes.ml
</pre>
 
=={{header|Perl}}==
For the supplied directory, compare all files, recursing into sub-directories. By default, duplicate files of 1 byte or larger are shown; the minimum size is configurable with a command-line option. Uses CPAN <code>File</code> modules for enhanced portability.
<langsyntaxhighlight lang="perl">use File::Find qw(find);
use File::Compare qw(compare);
use Sort::Naturally;
Line 651 ⟶ 1,090:
}
 
print for find_dups(@ARGV);</langsyntaxhighlight>
{{out}}
<pre> 372 aaa.txt, dir2/aaa.txt
Line 658 ⟶ 1,097:
=={{header|Phix}}==
Works on Windows and Linux. No handling of hard (or soft) links.
<!--<syntaxhighlight lang="phix">(notonline)-->
<lang Phix>integer min_size=1
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- file i/o</span>
sequence res = {}
<span style="color: #004080;">integer</span> <span style="color: #000000;">min_size</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span>
atom t1 = time()+1
<span style="color: #004080;">sequence</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
 
<span style="color: #004080;">atom</span> <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
function store_res(string filepath, sequence dir_entry)
if not match("backup",filepath) -- (example filter)
<span style="color: #008080;">function</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">)</span>
and not find('d', dir_entry[D_ATTRIBUTES]) then
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"backup"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- (example filter)</span>
atom size = dir_entry[D_SIZE]
<span style="color: #008080;">and</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
if size>=min_size then
<span style="color: #004080;">atom</span> <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]</span>
res = append(res,{size,filepath,dir_entry})
<span style="color: #008080;">if</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">min_size</span> <span style="color: #008080;">then</span>
if time()>t1 then
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">size</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span><span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">})</span>
printf(1,"%d files found\r",length(res))
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
t1 = time()+1
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\r"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
end if
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
return 0 -- keep going
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end function
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- keep going</span>
integer exit_code = walk_dir("demo\\clocks\\love", routine_id("store_res"), true)
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
<span style="color: #004080;">integer</span> <span style="color: #000000;">exit_code</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">walk_dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"demo\\clocks\\love"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">true</span><span style="color: #0000FF;">)</span>
res = sort(res,DESCENDING)
printf(1,"%d files found\n",length(res))
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #000000;">DESCENDING</span><span style="color: #0000FF;">)</span>
 
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
integer duplicates = 0
for i=1 to length(res)-1 do
<span style="color: #004080;">integer</span> <span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
for j=i+1 to length(res) do
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
if res[i][1]!=res[j][1] then exit end if
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
string si = join_path({res[i][2],res[i][3][D_NAME]}),
<span style="color: #008080;">if</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
sj = join_path({res[j][2],res[j][3][D_NAME]})
<span style="color: #004080;">string</span> <span style="color: #000000;">si</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]}),</span>
integer fni = open(si,"rb"),
<span style="color: #000000;">sj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]})</span>
fnj = open(sj,"rb"),
<span style="color: #004080;">integer</span> <span style="color: #000000;">fni</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">si</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span>
size = res[i][1]
<span style="color: #000000;">fnj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sj</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span>
bool same = true
<span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span>
if fni=-1 or fnj=-1 then ?9/0 end if
<span style="color: #004080;">bool</span> <span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">true</span>
for k=1 to size+1 do -- (check eof as well)
<span style="color: #008080;">if</span> <span style="color: #000000;">fni</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">or</span> <span style="color: #000000;">fnj</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
if getc(fni)!=getc(fnj) then
<span style="color: #008080;">for</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> <span style="color: #000080;font-style:italic;">-- (check eof as well)</span>
same = false
<span style="color: #008080;">if</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)!=</span><span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
exit
<span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
end if
<span style="color: #008080;">exit</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
close(fni)
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
close(fnj)
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)</span>
if same then
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span>
-- prettifying the output left as an exercise...
<span style="color: #008080;">if</span> <span style="color: #000000;">same</span> <span style="color: #008080;">then</span>
?res[i]
<span style="color: #000080;font-style:italic;">-- prettifying the output left as an exercise...</span>
?res[j]
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
duplicates += 1
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]</span>
end if
<span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
if time()>t1 then
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
printf(1,"processing %d/%d...\r",{i,length(res)})
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
t1 = time()+1
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"processing %d/%d...\r"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span>
end if
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
printf(1,"%d duplicates found\n",duplicates)</lang>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d duplicates found\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">duplicates</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
Line 737 ⟶ 1,179:
=={{header|PicoLisp}}==
File duplicates in /bin dir on Void Linux. Hash provided by xxhash library via mmap.
<langsyntaxhighlight PicoLisplang="picolisp">`(== 64 64)
(de mmap (L F)
(native "@" "mmap" 'N 0 L 1 2 F 0) )
Line 770 ⟶ 1,212:
group
(cadr Lst) )
(and (filter cdr L) (println (car Lst) @)) ) )</langsyntaxhighlight>
{{out}}
<pre>
Line 792 ⟶ 1,234:
=={{header|Python}}==
 
<langsyntaxhighlight lang="python">from __future__ import print_function
import os
import hashlib
Line 843 ⟶ 1,285:
 
FindDuplicateFiles('/home/tim/Dropbox', 1024*1024)
</syntaxhighlight>
</lang>
 
=={{header|Racket}}==
<syntaxhighlight lang="racket">
#lang racket
 
Line 886 ⟶ 1,328:
 
(show-duplicates (find-system-path 'home-dir) 1024)
</syntaxhighlight>
</lang>
 
=={{header|Raku}}==
(formerly Perl 6)
{{works with|Rakudo|2017.06}}
This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same. Uses the very fast but cryptographically poor xxHash library to hash the files.
 
This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same.
<lang perl6>use Digest::xxHash;
 
<syntaxhighlight lang="raku" line>use Digest::SHA256::Native;
 
sub MAIN( $dir = '.', :$minsize = 5, :$recurse = True ) {
Line 910 ⟶ 1,352:
for %files.sort( +*.key ).grep( *.value.elems > 1)».kv -> ($size, @list) {
my %dups;
@list.map: { %dups{ sha256-hex( ($_.slurp :bin).decode ) }.push: $_.Str };
for %dups.grep( *.value.elems > 1)».value -> @dups {
say sprintf("%9s : ", scale $size ), @dups.join(', ');
Line 924 ⟶ 1,366:
default { ($bytes / 2**30).round(.1) ~ ' GB' }
}
}</syntaxhighlight>
{{out|Sample output}}
Passing in command line switches: --minsize=0 --recurse=False /home/me/p6
Line 938 ⟶ 1,380:
<br>Note that the &nbsp; '''tFID''' &nbsp; (temp) &nbsp; file is hard coded to the &nbsp; '''C:''' &nbsp; drive.
<br>Only minimal error checking is performed.
<syntaxhighlight lang="rexx">/*REXX program reads a (DOS) directory and finds and displays files that are identical.*/
sep=center(' files are identical in size and content: ',79,"═") /*define the header. */
tFID= 'c:\TEMP\FINDDUP.TMP' /*use this as a temporary FileID. */
Line 963 ⟶ 1,405:
 
if lines(tFID)\==0 then 'ERASE' tFID /*do housecleaning (delete temp file).*/
/*stick a fork in it, we're all done. */</syntaxhighlight>
'''output''' &nbsp; when using (checking) with the default root directory:
<pre>
Line 987 ⟶ 1,429:
::* &nbsp; uses variables for some command names and command options
::* &nbsp; shows the number of files examined and also the directory name
<syntaxhighlight lang="rexx">/*REXX program reads a (DOS) directory and finds and displays files that are identical.*/
sep=center(' files are identical in size and content: ',79,"═") /*define the header. */
parse arg !; if !all(arg()) then exit /*boilerplate HELP(?)*/
Line 1,043 ⟶ 1,485:
novalue: syntax: call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl)
p: return word(arg(1),1)
s: if arg(1)==1 then return arg(3); return word(arg(2) 's',1)</syntaxhighlight>
'''output''' &nbsp; when using the DIR (folder): &nbsp; H:\#\REX
<pre>
Line 1,066 ⟶ 1,508:
 
=={{header|Ring}}==
<syntaxhighlight lang="ring">
# Project : Find duplicate files
 
Line 1,111 ⟶ 1,553:
next
return alist
</syntaxhighlight>
</lang>
Output:
<pre>
Line 1,144 ⟶ 1,586:
=={{header|Ruby}}==
Files are first compared by size; when the sizes match, their MD5 digests are compared.
<syntaxhighlight lang="ruby">require 'digest/md5'
 
def find_duplicate_files(dir)
Line 1,161 ⟶ 1,603:
end
 
find_duplicate_files("/Windows/System32")</syntaxhighlight>
 
Sample Output:
Line 1,183 ⟶ 1,625:
 
=={{header|Rust}}==
<syntaxhighlight lang="rust">use std::{
collections::BTreeMap,
fs::{read_dir, File},
Line 1,279 ⟶ 1,721:
 
Ok(())
}</syntaxhighlight>
 
=={{header|Sidef}}==
It uses the portable ''File::Find'' module, which means that it should work on virtually any platform.
<syntaxhighlight lang="ruby"># usage: sidef fdf.sf [size] [dir1] [...]
 
require('File::Find')
Line 1,331 ⟶ 1,773:
say "#{files.sort.join(%Q[\n])}\n#{'-'*80}"
}
}</syntaxhighlight>
Section of sample output:
<pre>% sidef fdf.sf 0 /tmp /usr/bin
Line 1,358 ⟶ 1,800:
{{tcllib|fileutil}}
{{tcllib|md5}}
<syntaxhighlight lang="tcl">package require fileutil
package require md5
 
Line 1,414 ⟶ 1,856:
}
}
}</syntaxhighlight>
Section of sample output:
<pre>
Line 1,423 ⟶ 1,865:
./compat/zlib/contrib/delphi/zlibd32.mak has duplicates
dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak
</pre>
 
=={{header|Wren}}==
{{libheader|Wren-crypto}}
{{libheader|Wren-sort}}
<syntaxhighlight lang="wren">import "io" for Directory, File, Stat
import "./crypto" for Sha1
import "./sort" for Sort
 
// Reports every group of identical files found directly inside 'dir'
// (sub-directories are not descended into) whose size is at least 'minSize'
// bytes. Files are grouped by the SHA-1 digest of their contents and the
// groups are printed in order of decreasing file size.
// Aborts the current fiber if 'dir' does not exist.
var findDuplicates = Fn.new { |dir, minSize|
    if (!Directory.exists(dir)) Fiber.abort("Directory does not exist.")

    // Map each content digest to the names of the entries that produced it.
    var byDigest = {}
    for (name in Directory.list(dir)) {
        var fullPath = "%(dir)/%(name)"
        if (Stat.path(fullPath).size < minSize) continue   // below the size threshold
        if (Stat.path(fullPath).isDirectory) continue      // directories are not hashed
        var digest = Sha1.digest(File.read(fullPath))
        if (!byDigest.containsKey(digest)) byDigest[digest] = []
        byDigest[digest].add(name)
    }

    // Keep only digests shared by two or more files, each tagged with the
    // size of the first file in the group.
    var groups = []
    for (names in byDigest.values) {
        if (names.count <= 1) continue
        var size = Stat.path("%(dir)/%(names[0])").size
        groups.add([size, names])
    }

    // Largest files first.
    var bySizeDescending = Fn.new { |a, b| (b[0] - a[0]).sign }
    Sort.insertion(groups, bySizeDescending)

    System.print("The sets of duplicate files are:\n")
    for (group in groups) {
        System.print("Size %(group[0]) bytes:")
        System.print(group[1].join("\n"))
        System.print()
    }
}
 
findDuplicates.call("./", 1000)</syntaxhighlight>
 
{{out}}
Sample output:
<pre>
The sets of duplicate files are:
 
Size 57221 bytes:
big.wren
big2.wren
 
Size 16696 bytes:
cls
clsc
 
Size 4096 bytes:
data.blk
data2.blk
 
Size 1415 bytes:
circular.wren
circular2.wren
</pre>
 
Line 1,431 ⟶ 1,939:
 
File findDupFiles.zkl:
<syntaxhighlight lang="zkl">include(zkl.h.zkl);
const FLAGS=FILE.GLOB.IGNORE_CASE + FILE.GLOB.NO_DIRS;
var [const] MsgHash=Import("zklMsgHash");
Line 1,484 ⟶ 1,992:
println("Found %d duplicate files:".fmt(files.apply("len").sum(0)));
foreach group in (files){ println(" ",group.concat(", ")) }</syntaxhighlight>
{{out}}
<pre>
2,458

edits