Find duplicate files: Difference between revisions

(→‎{{header|Perl 6}}: Add a Perl6 example)
 
(36 intermediate revisions by 17 users not shown)
Line 1:
{{task}}
{{draft task}} In a large directory structure it is easy to inadvertently leave unnecessary copies of files around, which can use considerable disk space and create confusion. Create a program which, given a minimum size and a folder/directory, will find all files of at least ''size'' bytes with duplicate contents under the directory and output or show the sets of duplicate files in order of decreasing size.
In a large directory structure it is easy to inadvertently leave unnecessary copies of files around, which can use considerable disk space and create confusion.
 
 
The program may be command-line or graphical, and duplicate content may be determined by direct comparison or by calculating a hash of the data. Specify which filesystems or operating systems your program works with if it has any filesystem- or OS-specific requirements. Identify hard links (filenames referencing the same content) in the output if applicable for the filesystem. For extra points detect when whole directory sub-trees are identical, or optionally remove or link identical files.
;Task:
Create a program which, given a minimum size and a folder/directory, will find all files of at least ''size'' bytes with duplicate contents under the directory and output or show the sets of duplicate files in order of decreasing size.
 
The program may be command-line or graphical, and duplicate content may be determined by direct comparison or by calculating a hash of the data.
 
Specify which filesystems or operating systems your program works with if it has any filesystem- or OS-specific requirements.
 
Identify hard links (filenames referencing the same content) in the output if applicable for the filesystem.
 
For extra points, detect when whole directory sub-trees are identical, or optionally remove or link identical files.
<br><br>
 
=={{header|C++}}==
The code uses
 
- xxhash_cpp downloaded from https://github.com/RedSpah/xxhash_cpp
 
- boost filesystem, boost format, and boost iostreams
 
Compiles on Windows10 and Linux.
<syntaxhighlight lang="cpp">
#include<iostream>
#include<string>
#include<boost/filesystem.hpp>
#include<boost/format.hpp>
#include<boost/iostreams/device/mapped_file.hpp>
#include<optional>
#include<algorithm>
#include<iterator>
#include<execution>
#include"dependencies/xxhash.hpp" // https://github.com/RedSpah/xxhash_cpp
 
/**
 * Split [begin, end[ into maximal runs of neighbouring elements that map to
 * the same value and report every run to a callback.
 *
 * @param begin    first element of the sequence
 * @param end      one past the last element of the sequence
 * @param getvalue maps an element to the value used for comparison, V(*T&)
 * @param callback invoked once per run as void(run_start, run_end, value)
 * @return number of runs reported (0 for an empty sequence)
 */
template<typename T, typename V, typename F>
size_t for_each_adjacent_range(T begin, T end, V getvalue, F callback) {
    size_t run_count = 0;
    for (T run_start = begin; run_start != end; ++run_count) {
        // The value of the first element defines the run.
        auto const& run_value = getvalue(*run_start);
        T run_end = run_start;
        do {
            ++run_end;
        } while (run_end != end && getvalue(*run_end) == run_value);
        callback(run_start, run_end, run_value);
        // The next run starts where this one stopped.
        run_start = run_end;
    }
    return run_count;
}
 
namespace bi = boost::iostreams;
namespace fs = boost::filesystem;
 
/**
 * A file found during the scan: its path, the size recorded when the entry
 * was created, and a 64-bit xxHash of its contents that is computed on first
 * request and cached afterwards.
 */
struct file_entry {
public:
    /// Stores the entry's path and the size reported by fs::file_size.
    explicit file_entry(fs::directory_entry const & entry)
        : path_{entry.path()}, size_{fs::file_size(entry)}
    {}
    /// Size in bytes as recorded at construction time.
    auto size() const { return size_; }
    auto const& path() const { return path_; }
    /// Content hash; computed lazily, then served from the cache.
    /// @throws std::runtime_error if the file cannot be opened for mapping
    auto get_hash() {
        if (!hash_)
            hash_ = compute_hash();
        return *hash_;
    }
private:
    /// Memory-maps the file and hashes the mapped bytes.
    xxh::hash64_t compute_hash() {
        xxh::hash_state64_t hash_stream;
        // An empty file has nothing to map (presumably mapping a zero-length
        // file fails -- TODO confirm); its hash is that of empty input.
        if (size_ == 0)
            return hash_stream.digest();
        bi::mapped_file_source source;
        source.open<fs::wpath>(this->path());
        if (!source.is_open()) {
            std::cerr << "Cannot open " << path() << std::endl;
            throw std::runtime_error("Cannot open file");
        }
        // Hash exactly what was mapped. size_ was recorded during the scan and
        // may be stale; using it could read past the end of the mapping if the
        // file has shrunk since.
        hash_stream.update(source.data(), source.size());
        return hash_stream.digest();
    }
private:
    fs::wpath path_;
    uintmax_t size_;
    std::optional<xxh::hash64_t> hash_;
};
 
using vector_type = std::vector<file_entry>;
using iterator_type = vector_type::iterator;
 
/**
 * Recursively walk a directory and append every regular file of at least
 * min_size bytes to file_vector.
 *
 * @param path        directory to scan; a message is printed if it is not one
 * @param file_vector receives one file_entry per accepted file
 * @param min_size    smallest file size (in bytes) that is accepted
 * @return tuple (entries visited, entries not added)
 */
auto find_files_in_dir(fs::wpath const& path, vector_type& file_vector, uintmax_t min_size = 1) {
    size_t visited = 0, skipped = 0;
    if (fs::is_directory(path)) {
        std::cerr << "Searching " << path << std::endl;

        for (auto& entry : fs::recursive_directory_iterator(path)) {
            ++visited;
            // The size is only queried for regular files.
            bool const accepted = fs::is_regular_file(entry) && fs::file_size(entry) >= min_size;
            if (!accepted) {
                ++skipped;
                continue;
            }
            file_vector.emplace_back(entry);
        }
    }
    else {
        std::cerr << path << " is not a directory!" << std::endl;
    }
    return std::make_tuple(visited, skipped);
}
 
int main(int argn, char* argv[])
{
vector_type files;
for (auto i = 1; i < argn; ++i) {
fs::wpath path(argv[i]);
auto [found, ignored] = find_files_in_dir(path, files);
std::cerr << boost::format{
" %1$6d files found\n"
" %2$6d files ignored\n"
" %3$6d files added\n" } % found % ignored % (found - ignored)
<< std::endl;
}
 
std::cerr << "Found " << files.size() << " regular files" << std::endl;
// sort files in descending order by file size
std::sort(std::execution::par_unseq, files.begin(), files.end()
, [](auto const& a, auto const& b) { return a.size() > b.size(); }
);
for_each_adjacent_range(
std::begin(files)
, std::end(files)
, [](vector_type::value_type const& f) { return f.size(); }
, [](auto start, auto end, auto file_size) {
// Files with same size
size_t nr_of_files = std::distance(start, end);
if (nr_of_files > 1) {
// sort range start-end by hash
std::sort(start, end, [](auto& a, auto& b) {
auto const& ha = a.get_hash();
auto const& hb = b.get_hash();
auto const& pa = a.path();
auto const& pb = b.path();
return std::tie(ha, pa) < std::tie(hb, pb);
});
for_each_adjacent_range(
start
, end
, [](vector_type::value_type& f) { return f.get_hash(); }
, [file_size](auto hstart, auto hend, auto hash) {
// Files with same size and same hash are assumed to be identical
// could resort to compare files byte-by-byte now
size_t hnr_of_files = std::distance(hstart, hend);
if (hnr_of_files > 1) {
std::cout << boost::format{ "%1$3d files with hash %3$016x and size %2$d\n" }
% hnr_of_files % file_size % hash;
std::for_each(hstart, hend, [hash, file_size](auto& e) {
std::cout << '\t' << e.path() << '\n';
}
);
}
}
);
}
}
);
return 0;
}
 
</syntaxhighlight>
{{out}}
<pre>$ ./duplicates /usr/include/boost/
Searching "/usr/include/boost/"
15264 files found
1160 files ignored
14104 files added
 
Found 14104 regular files
2 files with hash 0c5e81a47dd8cd99 and size 15811
"/usr/include/boost/mpl/vector/aux_/preprocessed/no_ctps/vector50_c.hpp"
"/usr/include/boost/mpl/vector/aux_/preprocessed/plain/vector50_c.hpp"
3 files with hash 0f2775c41bb647f3 and size 14766
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/vector.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/vector.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/vector.hpp"
3 files with hash f9e02ecc3e38f3a3 and size 14714
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/deque.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/deque.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/deque.hpp"
3 files with hash 73ed6d15fd62f8b3 and size 14620
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/list.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/list.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/list.hpp"
3 files with hash 7a43c97436ae1913 and size 14547
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/set.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/set.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/set.hpp"
...
</pre>
 
=={{header|Elixir}}==
{{trans|Ruby}}
<langsyntaxhighlight lang="elixir">defmodule Files do
def find_duplicate_files(dir) do
IO.puts "\nDirectory : #{dir}"
Line 26 ⟶ 225:
end
 
hd(System.argv) |> Files.find_duplicate_files</langsyntaxhighlight>
 
{{out}}
Line 46 ⟶ 245:
{{2014, 4, 11}, {13, 39, 56}} 18088 msvcr100_clr0400.dll
{{2014, 4, 11}, {13, 39, 56}} 18088 msvcr110_clr0400.dll
</pre>
 
=={{header|Go}}==
In theory this should work on any of the operating systems supported by Go (Linux, macOS, Windows, OpenBSD etc.) though only tested on Ubuntu 16.04.
<syntaxhighlight lang="go">package main
 
import (
"fmt"
"crypto/md5"
"io/ioutil"
"log"
"os"
"path/filepath"
"sort"
"time"
)
 
// fileData pairs the path of a visited file with the os.FileInfo that the
// directory walk reported for it.
type fileData struct {
filePath string
info os.FileInfo
}

// hash is a 16-byte digest; it is the key of the duplicate-detection map.
type hash [16]byte
 
// check does nothing for a nil error and otherwise hands the error to
// log.Fatal.
func check(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
 
// checksum reads the whole file at filePath into memory and returns its MD5
// digest. A read error is passed to check.
func checksum(filePath string) hash {
	contents, readErr := ioutil.ReadFile(filePath)
	check(readErr)
	digest := md5.Sum(contents)
	return hash(digest)
}
 
// findDuplicates walks dirPath and returns one pair per duplicate found: the
// first file seen with a given checksum together with each later file that
// has the same checksum. Only regular files of at least minSize bytes are
// considered. Any error reported by the walk is passed to check.
func findDuplicates(dirPath string, minSize int64) [][2]fileData {
	var dups [][2]fileData
	firstSeen := make(map[hash]fileData)
	werr := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Skip everything that is not a regular file. Testing only !IsDir()
		// would let symbolic links and special files through; reading a
		// symlink that points at a directory fails and aborts the whole run.
		if !info.Mode().IsRegular() || info.Size() < minSize {
			return nil
		}
		h := checksum(path)
		current := fileData{path, info}
		if original, ok := firstSeen[h]; ok {
			dups = append(dups, [2]fileData{original, current})
		} else {
			firstSeen[h] = current
		}
		return nil
	})
	check(werr)
	return dups
}
 
// main searches the current directory for duplicates among files of at least
// one byte and prints every pair, largest files first.
func main() {
dups := findDuplicates(".", 1)
fmt.Println("The following pairs of files have the same size and the same hash:\n")
fmt.Println("File name Size Date last modified")
fmt.Println("==========================================================")
// Order the pairs by the size of their first member, descending.
sort.Slice(dups, func(i, j int) bool {
return dups[i][0].info.Size() > dups[j][0].info.Size() // in order of decreasing size
})
for _, dup := range dups {
// Print both members of the pair, then a blank separator line.
for i := 0; i < 2; i++ {
d := dup[i]
fmt.Printf("%-20s %8d %v\n", d.filePath, d.info.Size(), d.info.ModTime().Format(time.ANSIC))
}
fmt.Println()
}
}</syntaxhighlight>
 
{{out}}
Sample output:
<pre>
The following pairs of files have the same size and the same hash:
 
File name Size Date last modified
==========================================================
vib.gif 689113 Wed Sep 26 16:33:34 2018
vibrating.gif 689113 Tue Oct 2 00:38:08 2018
 
analysis2.txt 6155 Thu Sep 13 12:19:06 2018
temp/analysis3.txt 6155 Fri Dec 28 15:20:54 2018
 
w_pinstripe.png 2994 Tue Sep 25 12:18:05 2018
wb_pinstripe.png 2994 Tue Sep 25 12:06:53 2018
 
sox.txt 63 Sat Dec 22 21:59:23 2018
sox2.txt 63 Fri Dec 28 12:19:02 2018
</pre>
 
Line 54 ⟶ 347:
- works on Windows as well as Unix Systems (tested with Mint 17 / Windows 7)
</pre>
<syntaxhighlight lang="haskell">
<lang Haskell>
import Crypto.Hash.MD5 (hash)
import Data.ByteString as BS (readFile, ByteString())
Line 118 ⟶ 411:
printf "Something went wrong - please use ./%s <dir> <bytes>\n" name
 
</syntaxhighlight>
</lang>
 
Example output:
Line 154 ⟶ 447:
</pre>
 
=={{header|Perl 6Java}}==
This should work with any OS or filesystem supported by Java.
{{works with|Rakudo|2017.05}}
Hard links are indicated by displaying the files on the same line separated by "=".
This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same. Uses the very fast but cryptographically poor xxHash library to hash the files.
MD5 checksums are used to detect duplicate files.
<syntaxhighlight lang="java">import java.io.*;
import java.nio.*;
import java.nio.file.*;
import java.nio.file.attribute.*;
import java.security.*;
import java.util.*;
 
public class DuplicateFiles {
<lang perl6>use Digest::xxHash;
public static void main(String[] args) {
if (args.length != 2) {
System.err.println("Directory name and minimum file size are required.");
System.exit(1);
}
try {
findDuplicateFiles(args[0], Long.parseLong(args[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
 
private static void findDuplicateFiles(String directory, long minimumSize)
sub MAIN( $dir = '.', :$minsize = 5, :$recurse = True ) {
throws IOException, NoSuchAlgorithmException {
my %files;
System.out.println("Directory: '" + directory + "', minimum size: " + minimumSize + " bytes.");
my @dirs = $dir.IO.absolute.IO;
Path path = FileSystems.getDefault().getPath(directory);
while @dirs {
FileVisitor visitor = new FileVisitor(path, minimumSize);
my @files = @dirs.pop;
whileFiles.walkFileTree(path, @files {visitor);
System.out.println("The following sets of files have the same size and checksum:");
for @files.pop.dir -> $path {
for (Map.Entry<FileKey, Map<Object, List<String>>> e : visitor.fileMap_.entrySet()) {
%files{ $path.s }.push: $path if $path.f and $path.s >= $minsize;
Map<Object, List<String>> map = e.getValue();
@dirs.push: $path if $path.d and $path.r and $recurse
if (!containsDuplicates(map))
continue;
List<List<String>> fileSets = new ArrayList<>(map.values());
for (List<String> files : fileSets)
Collections.sort(files);
Collections.sort(fileSets, new StringListComparator());
FileKey key = e.getKey();
System.out.println();
System.out.println("Size: " + key.size_ + " bytes");
for (List<String> files : fileSets) {
for (int i = 0, n = files.size(); i < n; ++i) {
if (i > 0)
System.out.print(" = ");
System.out.print(files.get(i));
}
System.out.println();
}
}
}
 
private static class StringListComparator implements Comparator<List<String>> {
for %files.sort( +*.key ).grep( *.value.elems > 1)».kv -> ($size, @list) {
public int compare(List<String> a, List<String> b) {
my %dups;
@list.map: { %dups{ xxHash( :buf-u8(int $_.slurplen1 :bin= a.size(), )len2 }.push:= $_b.Str }size();
for %dups.grep(int *.value.elemsi >= 1)».value0; ->i @dups< len1 && i < len2; ++i) {
say sprintf("%9s : ", scaleint $sizec ),= @dupsa.joinget(', 'i).compareTo(b.get(i));
if (c != 0)
return c;
}
return Integer.compare(len1, len2);
}
}
}
 
private static boolean containsDuplicates(Map<Object, List<String>> map) {
sub scale ($bytes) {
if (map.size() > 1)
given $bytes {
return true;
when $_ < 2**10 { $bytes ~ ' B' }
when $_for (List<String> 2**20files {: ($bytes / 2**10)map.roundvalues(.1)) ~ ' KB' }{
when $_ < 2**30 {if ($bytes / 2**20)files.roundsize(.1) ~> ' MB' }1)
default return { ($bytes / 2**30).round(.1) ~ ' GB' }true;
}
return false;
}
 
}</lang>
private static class FileVisitor extends SimpleFileVisitor<Path> {
private MessageDigest digest_;
private Path directory_;
private long minimumSize_;
private Map<FileKey, Map<Object, List<String>>> fileMap_ = new TreeMap<>();
 
private FileVisitor(Path directory, long minimumSize) throws NoSuchAlgorithmException {
directory_ = directory;
minimumSize_ = minimumSize;
digest_ = MessageDigest.getInstance("MD5");
}
 
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
if (attrs.size() >= minimumSize_) {
FileKey key = new FileKey(file, attrs, getMD5Sum(file));
Map<Object, List<String>> map = fileMap_.get(key);
if (map == null)
fileMap_.put(key, map = new HashMap<>());
List<String> files = map.get(attrs.fileKey());
if (files == null)
map.put(attrs.fileKey(), files = new ArrayList<>());
Path relative = directory_.relativize(file);
files.add(relative.toString());
}
return FileVisitResult.CONTINUE;
}
 
private byte[] getMD5Sum(Path file) throws IOException {
digest_.reset();
try (InputStream in = new FileInputStream(file.toString())) {
byte[] buffer = new byte[8192];
int bytes;
while ((bytes = in.read(buffer)) != -1) {
digest_.update(buffer, 0, bytes);
}
}
return digest_.digest();
}
}
 
private static class FileKey implements Comparable<FileKey> {
private byte[] hash_;
private long size_;
 
private FileKey(Path file, BasicFileAttributes attrs, byte[] hash) throws IOException {
size_ = attrs.size();
hash_ = hash;
}
 
public int compareTo(FileKey other) {
int c = Long.compare(other.size_, size_);
if (c == 0)
c = hashCompare(hash_, other.hash_);
return c;
}
}
 
/**
 * Compares two byte arrays element by element (as signed bytes). The first
 * differing position decides; if one array is a prefix of the other, the
 * shorter array orders first.
 */
private static int hashCompare(byte[] a, byte[] b) {
    int shared = Math.min(a.length, b.length);
    int index = 0;
    while (index < shared) {
        int difference = Byte.compare(a[index], b[index]);
        if (difference != 0)
            return difference;
        ++index;
    }
    return Integer.compare(a.length, b.length);
}
}</syntaxhighlight>
 
{{out}}
<pre>
Directory: 'test', minimum size: 1000 bytes.
The following sets of files have the same size and checksum:
 
Size: 16370 bytes
file2
file4
file5
 
Size: 8188 bytes
file1 = file3
file6
</pre>
 
=={{header|jq}}==
'''Works with jq, the C implementation of jq'''
 
'''Works with gojq, the Go implementation of jq'''
 
'''Works with jaq, the Rust implementation of jq'''
 
This entry illustrates how jq plays nicely with other command-line
tools -
in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `md5sum`,
the idea being that in practice, two files will almost surely have the same
contents if their md5sum values are the same. It should be noted, however,
that jq can also easily be used to parse the raw output of `md5sum` by using the -R option;
the modifications required for the jq program given below would all be trivial to make.
We'll also use the `-type f` option of the `find` command as this excludes symbolic links,
which we'll assume should be ignored.
 
An appropriate invocation of these three command-line tools would be along the lines of:
 
<pre>
jc md5sum $(find . -type f ) | jq -nc -f find-duplicate-files.jq
</pre>
 
The output will be a stream of arrays, each array listing the files with the same hash.
<syntaxhighlight lang="jq">
# Each input is expected to be a JSON array of objects that carry the keys
# "hash" and "filename".

# Build an object from `stream`: every item is filed under the key that `f`
# yields for it, and each key holds the array of `g` results in arrival order.
def dictionary(stream; f; g):
  reduce stream as $item ({}; .[($item | f)] |= (. + [$item | g]));

# Group the filenames by hash and emit only the groups with several members.
dictionary(inputs[]; .hash; .filename)
| to_entries
| map(.value)
| .[]
| select(length > 1)
</syntaxhighlight>
{{output}}
<pre>
["./toplevel.txt","./tmp/toplevel.txt"]
</pre>
 
=={{header|Julia}}==
{{works with|Julia|0.6 and higher}}
This solution uses [https://github.com/staticfloat/Nettle.jl Nettle] package for MD5 hashing.
Should work on Windows, macOS and Linux.
 
<syntaxhighlight lang="julia">using Printf, Nettle
 
"""
    find_duplicates(path, minsize = 0)

Walk `path` recursively and group all files of at least `minsize` bytes by the
MD5 digest of their contents. Returns a vector with one entry per digest that
is shared by two or more files; each entry is a vector of named tuples
`(path = ..., stats = ...)` holding the file path and its `stat` result.
"""
function find_duplicates(path::String, minsize::Int = 0)
    filesdict = Dict{String,Array{NamedTuple}}()

    for (root, dirs, files) in walkdir(path), fn in files
        filepath = joinpath(root, fn)
        filestats = stat(filepath)

        # "At least minsize bytes": a file of exactly minsize bytes qualifies.
        filestats.size >= minsize || continue

        hash = open(f -> hexdigest("md5", read(f)), filepath)

        # Append to the digest's group, creating the group on first use.
        push!(get!(filesdict, hash, NamedTuple[]), (path = filepath, stats = filestats))
    end

    # Only digests seen more than once describe duplicates.
    return [tups for tups in values(filesdict) if length(tups) > 1]
end
 
"""
    main()

Search the current directory for duplicate files of at least one byte and
print each group of duplicates, largest files first, as a table of path,
size and modification time.
"""
function main()
    path = "."
    println("Finding duplicates in \"$path\"")
    dups = find_duplicates(path, 1)

    println("The following group of files have the same size and the same hash:\n")
    println("File name Size last modified")
    println("="^76)

    # Groups are ordered by the size of their first member, descending.
    groups = sort(dups, by = tups -> tups[1].stats.size, rev = true)
    for group in groups
        # Within a group the paths are listed in reverse alphabetical order.
        for entry in sort(group, by = tup -> tup.path, rev = true)
            @printf("%-44s%8d %s\n", entry.path, entry.stats.size, Libc.strftime(entry.stats.mtime))
        end
        println()
    end
end
 
main()</syntaxhighlight>
 
{{out}}
Sample output:
<pre>
Finding duplicates in "."
The following group of files have the same size and the same hash:
 
File name Size last modified
============================================================================
.\TestExamples\audio_file.mp3 8945229 11/29/2019 7:39:50 PM
.\TestExamples\audio_file-copy.mp3 8945229 11/30/2019 1:03:09 PM
 
.\TestExamples\doc_file.doc 503296 11/29/2019 7:39:49 PM
.\TestExamples\doc_file-txt_copy.txt 503296 11/29/2019 7:40:05 PM
.\TestExamples\doc_file-copy.doc 503296 11/29/2019 7:39:49 PM
 
.\TestExamples\text_file.txt 27 11/29/2019 7:37:12 PM
.\TestExamples\text_file-copy.txt 27 11/29/2019 7:37:12 PM
</pre>
 
=={{header|Mathematica}}/{{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">hash="SHA256";
(* Smallest file size of interest. *)
minSize=Quantity[1,"Megabytes"];
(* Every non-directory entry below the current directory that is larger than minSize. *)
allfiles=Once@Select[FileNames["*","",∞],!Once@DirectoryQ[#]&&Once@FileSize[#]>minSize&];
(* Pair each file with its hash. All candidates are hashed: the former [[;;5]]
   restricted the search to the first five files and failed with fewer. *)
data={#,Once[FileHash[#,hash,All,"HexString"]]}&/@allfiles;
Grid[Select[GatherBy[data,Last],Length[#]>1&][[All,All,1]]]</syntaxhighlight>
{{out|Sample output}}
sample directory:
Passing in command line switches: --minsize=0 --r=False /home/me/p6
<pre>someFile eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe
<pre> 0 B : /home/me/p6/vor.ppm, /home/me/p6/ns.txt
someFile2 3e6be6db0858c18573af3fde8308fa9759209079e2e372e21ebd6d3c8512d09e
190 B : /home/me/p6/scrub(copy).t, /home/me/p6/scrub.t
someFile3 bef0039c33277f743b60b0076871110b96e14de34045aafc8e764349de6043b5
1.3 KB : /home/me/p6/coco.p6, /home/me/p6/coc.p6
directory\someFile eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe
80.5 KB : /home/me/p6/temp.txt, /home/me/p6/temp.html
directory\someFile4 e6385b50ec8b052b141588573f680261db714babe534d8ced8a17985b14f58e9</pre>
279.6 KB : /home/me/p6/pentaflake.svg, /home/me/p6/5nflake.svg</pre>
sample output:
<pre>35 MB {someFile,directory\someFile}</pre>
 
=={{header|Nim}}==
Our solution works on Linux and likely on any Posix system. To mark hard links, we provide the inode number of the file. Two paths with the same inode number are in fact two links to the same file. To make them more visible, an asterisk is used.
 
The detection of hard links may be OS dependent and may not work on Windows.
 
<syntaxhighlight lang="nim">import algorithm
import os
import strformat
import strutils
import tables
import std/sha1
import times
 
type

# Mapping "size" -> "list of paths".
PathsFromSizes = Table[BiggestInt, seq[string]]

# Mapping "hash" -> "list of paths".
PathsFromHashes = Table[string, seq[string]]

# A file size together with a list of paths.
Info = tuple[size: BiggestInt; paths: seq[string]]
 
 
#---------------------------------------------------------------------------------------------------
 
proc processCmdLine(): tuple[dirpath: string; minsize: Natural] =
  ## Process the command line and return the directory to scan and the minimum
  ## file size. The size is optional and stays at its default (0) when absent.
  ## Extra parameters are ignored. Quits with a message when no parameter is
  ## given, when the directory does not exist, or when the size is not a
  ## non-negative integer.

  if paramCount() == 0:
    quit fmt"Usage: {getAppFileName().splitPath()[1]} folder minsize"

  result.dirpath = paramStr(1)
  if not result.dirpath.dirExists():
    quit fmt"Wrong directory path: {result.dirpath}"

  if paramCount() >= 2:
    try:
      let value = parseInt(paramStr(2))
      # A negative value parses fine but does not fit in a Natural; reject it
      # here with the usual message instead of failing on the assignment.
      if value < 0:
        raise newException(ValueError, "negative size")
      result.minsize = value
    except ValueError:
      quit fmt"Wrong minimum size: {paramStr(2)}"
 
#---------------------------------------------------------------------------------------------------
 
proc initPathsFromSize(dirpath: string; minsize: Natural): PathsFromSizes =
  ## Walk "dirpath" recursively and return a table mapping each file size of
  ## at least "minsize" bytes to the paths of the files having that size.

  for candidate in walkDirRec(dirpath):
    # Paths for which fileExists is false are skipped.
    if fileExists(candidate):
      let bytes = getFileSize(candidate)
      if bytes >= minsize:
        result.mgetOrPut(bytes, @[]).add(candidate)
 
#---------------------------------------------------------------------------------------------------
 
proc initPathsFromHashes(pathsFromSizes: PathsFromSizes): PathsFromHashes =
  ## Return a table mapping a SHA1 digest to the paths of the files having
  ## that digest. Only files whose size is shared with at least one other
  ## file are hashed.

  for sameSizePaths in pathsFromSizes.values:
    if sameSizePaths.len <= 1:
      continue    # A unique size cannot have a duplicate.
    for filePath in sameSizePaths:
      let digest = $secureHashFile(filePath)
      result.mgetOrPut(digest, @[]).add(filePath)
 
#---------------------------------------------------------------------------------------------------
 
proc cmp(x, y: Info): int =
  ## Order two information tuples by size; tuples of equal size are ordered
  ## by their first path. Used to sort the list of duplicate files.

  let bySize = cmp(x.size, y.size)
  if bySize != 0:
    bySize
  else:
    # Same size: fall back to the first paths (expected to differ).
    cmp(x.paths[0], y.paths[0])
 
#---------------------------------------------------------------------------------------------------
 
proc displayDuplicates(dirpath: string; pathsFromHashes: PathsFromHashes) =
  ## Print the groups of files sharing a hash, in decreasing size order. Each
  ## line shows size, modification time, inode, a hard link mark and the path
  ## relative to "dirpath".

  echo "Files with same size and same SHA1 hash value in directory: ", dirpath
  echo ""

  # Keep the hash groups that contain at least two paths.
  var groups: seq[Info]
  for sameHashPaths in pathsFromHashes.values:
    if sameHashPaths.len < 2:
      continue
    groups.add((sameHashPaths[0].getFileSize(), sorted(sameHashPaths)))

  if groups.len == 0:
    echo "No files"
    return

  groups.sort(cmp, Descending)

  # Table header, then one blank-line separated section per group.
  echo fmt"""{"Size":>10} {"Last date modified":^19} {"Inode":>8} HL File name"""
  echo repeat('=', 80)
  for (size, paths) in groups:
    echo ""
    for path in paths:
      let info = path.getFileInfo()
      let inode = info.id.file
      # A link count other than one is marked with an asterisk.
      let hardlink = if info.linkCount == 1: " " else: "*"
      let mtime = path.getLastModificationTime().format("YYYY-MM-dd HH:mm:ss")
      echo fmt"{size:>10} {mtime:>23} {inode:>12} {hardlink:<5} {path.relativePath(dirpath)}"
 
 
#———————————————————————————————————————————————————————————————————————————————————————————————————
 
# Driver: read the arguments, group the files by size, then hash the files
# whose size is not unique.
let (dirpath, minsize) = processCmdLine()
let pathsFromSizes = initPathsFromSize(dirpath, minsize)
let pathsFromHashes = initPathsFromHashes(pathsFromSizes)
dirpath.displayDuplicates(pathsFromHashes)</syntaxhighlight>
 
{{out}}
<pre>Files with same size and same SHA1 hash value in directory: .
 
Size Last date modified Inode HL File name
================================================================================
 
499515 2020-12-10 22:48:06 12981503 subdir/tree.ppm
499515 2020-12-10 22:45:26 12722201 * subdir/tree1.ppm
499515 2020-12-10 22:45:26 12722201 * tree.ppm
499515 2020-12-10 22:47:51 12722205 tree1.ppm
 
65322 2020-12-10 22:44:53 12722178 * house.jpg
65322 2020-12-10 22:44:53 12722178 * house1.jpeg
 
6401 2020-12-10 22:45:07 12722182 dragon.png
6401 2020-12-10 22:45:53 12722204 dragon1.png
6401 2020-12-10 22:46:21 12981502 subdir/dragon.png</pre>
 
=={{header|Objeck}}==
Solution works on Windows, macOS and Linux.
<syntaxhighlight lang="objeck">use System.IO.File;
use System.Time;
use Collection;
 
# Finds files with identical size and MD5 hash among the entries of one
# directory and prints every group that has more than one member.
class Duplicate {
# Expects exactly two arguments: a directory and a minimum file size.
# Does nothing for any other argument count.
function : Main(args : String[]) ~ Nil {
if(args->Size() = 2) {
file_sets := SortDups(GetDups(args[0], args[1]->ToInt()));
each(i : file_sets) {
file_set := file_sets->Get(i)->As(Vector);
# only sets with several members are duplicates
if(file_set->Size() > 1) {
"Duplicates:"->PrintLine();
"----"->PrintLine();
each(j : file_set) {
file_set->Get(j)->As(FileMeta)->ToString()->PrintLine();
};
};
'\n'->Print();
};
};
}

# Re-files every set under the size of its first member and returns the
# values of that map.
# NOTE(review): presumably the map keeps one value per key, in which case two
# different sets with the same file size would overwrite each other - confirm.
function : SortDups(unsorted : Vector) ~ Vector {
sorted := IntMap->New();

each(i : unsorted) {
value := unsorted->Get(i)->As(Vector);
key := value->Get(0)->As(FileMeta)->GetSize();
sorted->Insert(key, value);
};

return sorted->GetValues();
}

# Groups the entries listed for 'dir' whose size is at least 'size' by a key
# built from the file size and the MD5 hash of the contents. Returns all
# groups, including those with a single member.
function : GetDups(dir : String, size : Int) ~ Vector {
duplicates := StringMap->New();

files := Directory->List(dir);
each(i : files) {
# full name is "<dir>/<entry>"
file_name := String->New(dir);
file_name += '/';
file_name += files[i];

file_size := File->Size(file_name);
if(file_size >= size) {
file_date := File->ModifiedTime(file_name);
# grouping key is "<size>:<md5>"
file_hash := file_size->ToString();
file_hash += ':';
file_hash += Encryption.Hash->MD5(FileReader->ReadBinaryFile(file_name))->ToString();
file_meta := FileMeta->New(file_name, file_size, file_date, file_hash);

# create the group the first time its key is seen
file_set := duplicates->Find(file_hash)->As(Vector);
if(file_set = Nil) {
file_set := Vector->New();
duplicates->Insert(file_hash, file_set);
};
file_set->AddBack(file_meta);
};
};

return duplicates->GetValues();
}
}
 
# Record describing one file: its name, size, modification date and the
# grouping key it was filed under.
class FileMeta {
@name : String;
@size : Int;
@date : Date;
@hash : String;

# Stores the four values unchanged.
New(name : String, size : Int, date : Date, hash : String) {
@name := name;
@size := size;
@date := date;
@hash := hash;
}

# Returns the stored file size.
method : public : GetSize() ~ Int {
return @size;
}

# Formats the record as "<name>, <size>, <date>"; the hash is not included.
method : public : ToString() ~ String {
date_str := @date->ToShortString();
return "{$@name}, {$@size}, {$date_str}";
}
}</syntaxhighlight>
 
{{output}}
<pre>
$ obr duplicate.obe /tmp/foo 4000
Duplicates:
----
/tmp/foo/bb.obe, 19822, 3/29/2019 8:07:21 PM
/tmp/foo/aa.obe, 19822, 3/29/2019 8:07:17 PM
 
Duplicates:
----
/tmp/foo/hh.obe, 20020, 3/29/2019 8:47:43 PM
/tmp/foo/gg.obe, 20020, 3/29/2019 8:47:37 PM
/tmp/foo/ee.obe, 20020, 3/29/2019 8:47:33 PM
/tmp/foo/dd.obe, 20020, 3/29/2019 8:47:14 PM
</pre>
 
=={{header|OCaml}}==
Although this solution uses the Unix module, it only calls lstat from there, which exists in the Windows port of the Unix module and so should be portable.
<syntaxhighlight lang="ocaml">let readdir_or_empty dir =
(* readdir_or_empty: returns the entries of [dir]. If reading fails with
   [Sys_error], the problem is reported on stderr and an empty array is
   returned instead, so one unreadable directory does not stop the caller. *)
try Sys.readdir dir
with Sys_error e ->
prerr_endline ("Could not read dir " ^ dir ^ ": " ^ e);
[||]
 
(* Walks the tree below [root] depth-first. [func path stat] is called for
   every regular file, where [stat] is the result of [Unix.lstat] on the path.
   Directories are descended into; all other kinds of entries, symbolic links
   included (lstat does not follow them), are ignored. *)
let directory_walk root func =
  let rec walk dir =
    let visit filename =
      let path = Filename.concat dir filename in
      let stat = Unix.lstat path in
      match stat.Unix.st_kind with
      | Unix.S_REG -> func path stat
      | Unix.S_DIR -> walk path
      | _ -> ()
    in
    Array.iter visit (readdir_or_empty dir)
  in
  walk root
 
(* Reads from [ic] into [buf], starting at [pos], until [len] bytes have been
   read or end of file is reached, repeating short reads as needed. Returns
   the position just after the last byte stored, which is the number of bytes
   read when the initial [pos] is 0. *)
let rec input_retry ic buf pos len =
  match input ic buf pos len with
  | 0 -> pos
  | n when n = len -> pos + n
  | n -> input_retry ic buf (pos + n) (len - n)
 
(* Opens [fn] for binary reading, applies [f] to the channel and returns its
   result. The channel is closed afterwards, also when [f] raises. *)
let with_file_in_bin fn f =
  let channel = open_in_bin fn in
  let release () = close_in channel in
  Fun.protect ~finally:release (fun () -> f channel)
 
(* Compares the contents of the files [fn1] and [fn2] chunk by chunk and
   returns [true] only if they are byte-for-byte identical. *)
let is_really_same_file fn1 fn2 =
  with_file_in_bin fn1 (fun fh1 ->
      with_file_in_bin fn2 (fun fh2 ->
          let len = 2048 in
          let buf1 = Bytes.create len in
          let buf2 = Bytes.create len in
          let rec aux () =
            let read1 = input_retry fh1 buf1 0 len in
            let read2 = input_retry fh2 buf2 0 len in
            if read1 <> read2 then false
            else if read1 = 0 then true (* both files ended together *)
            else if
              (* Compare only the bytes just read. [Bytes.create] leaves the
                 buffers uninitialised, so comparing the whole buffers could
                 report a difference for identical files whose last chunk is
                 shorter than [len]. *)
              Bytes.sub buf1 0 read1 <> Bytes.sub buf2 0 read1
            then false
            else aux ()
          in
          aux ()))
 
(* Driver. Sys.argv.(1) is the directory to walk and Sys.argv.(2) the minimum
   file size. Every regular file is first checked against the (device, inode)
   pairs seen so far; a repeat is reported as a hard link. Otherwise, if the
   file is large enough, it is compared with the earlier files that have the
   same digest and either reported as a match or recorded under its digest.
   NOTE(review): the hard-link report is printed before the size test, so it
   also appears for files smaller than the minimum - confirm this is intended. *)
let () =
let tbl = Hashtbl.create 128 in
let seen = Hashtbl.create 128 in
let min_size = int_of_string Sys.argv.(2) in
directory_walk Sys.argv.(1) (fun path stat ->
try
(* identity of the underlying file, shared by all of its hard links *)
let identity_tuple = (stat.st_dev, stat.st_ino) in
match Hashtbl.find_opt seen identity_tuple with
| Some existing ->
print_endline
("File " ^ existing ^ " is the same hard link as " ^ path)
| None -> (
Hashtbl.add seen identity_tuple path;
let size = stat.st_size in
if size >= min_size then
let digest = Digest.file path in
(* an equal digest is only a hint; the contents are compared as well *)
Hashtbl.find_all tbl digest
|> List.find_opt (is_really_same_file path)
|> function
| Some existing ->
print_endline ("File " ^ existing ^ " matches " ^ path)
| None -> Hashtbl.add tbl digest path)
with Sys_error e -> prerr_endline ("Could not hash " ^ path ^ ": " ^ e))
</syntaxhighlight>
 
{{output}}
<pre>$ dune build @fmt
$ dune exec ./finddupes.exe . 1024
File ./finddupes.ml matches ./_build/default/finddupes.ml
File ./finddupes.ml matches ./_build/default/.formatted/finddupes.ml
</pre>
 
=={{header|Perl}}==
For supplied directory, compare all files, recursing into sub-directories. By default, showing duplicate files of 1 byte or larger, configurable with command-line option. Using CPAN <code>File</code> modules for enhanced portability.
<syntaxhighlight lang="perl">use File::Find qw(find);
use File::Compare qw(compare);
use Sort::Naturally;
use Getopt::Std qw(getopts);
 
my %opts;
$opts{s} = 1;
getopts("s:", \%opts);
 
# find_dups($dir)
#
# Walk $dir recursively and return one formatted line per set of files with
# identical content: "<size> <path>, <path>, ...\n", largest size first.
# Only regular files of at least $opts{s} bytes are considered; symbolic
# links are ignored because the size/type test uses lstat.
sub find_dups {
    my ($dir) = @_;

    my @results;
    my %files;    # size in bytes => [ paths of regular files with that size ]
    find {
        no_chdir => 1,
        wanted   => sub {
            my @st = lstat($_) or return;    # unreadable entry: skip it
            return unless -f _;              # regular files only
            my $size = $st[7];
            # The minimum size lives in %opts (filled in by getopts).
            push @{ $files{$size} }, $_ if $size >= $opts{s};
        }
    } => $dir;

    foreach my $same_size (values %files) {
        next unless @$same_size > 1;    # a lone file cannot have a duplicate

        my %dups;                       # first file of a set => [ its clones ]
        foreach my $i (0 .. $#$same_size) {
            for (my $j = $i + 1; $j < @$same_size; $j++) {
                # compare() is non-zero when the files differ or cannot be read
                next if compare($same_size->[$i], $same_size->[$j]);
                # Move the clone out of the candidate list so it is not
                # compared again; $j-- re-examines the slot that shifted down.
                push @{ $dups{ $same_size->[$i] } }, splice @$same_size, $j--, 1;
            }
        }

        while (my ($original, $clones) = each %dups) {
            push @results, sprintf "%8d %s\n", (stat($original))[7], join ', ', sort $original, @$clones;
        }
    }

    # Natural sort on the size-prefixed lines, reversed for decreasing size.
    return reverse nsort @results;
}
 
print for find_dups(@ARGV);</syntaxhighlight>
{{out}}
<pre> 372 aaa.txt, dir2/aaa.txt
29 bbb.txt, dir1/bbb.txt</pre>
 
=={{header|Phix}}==
Works on Windows and Linux. No handling of hard (or soft) links.
<!--<syntaxhighlight lang="phix">(notonline)-->
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- file i/o</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">min_size</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"backup"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- (example filter)</span>
<span style="color: #008080;">and</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">min_size</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">size</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span><span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\r"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- keep going</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">exit_code</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">walk_dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"demo\\clocks\\love"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">true</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #000000;">DESCENDING</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">si</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]}),</span>
<span style="color: #000000;">sj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]})</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fni</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">si</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span>
<span style="color: #000000;">fnj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sj</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span>
<span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span>
<span style="color: #004080;">bool</span> <span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">true</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">fni</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">or</span> <span style="color: #000000;">fnj</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> <span style="color: #000080;font-style:italic;">-- (check eof as well)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)!=</span><span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">same</span> <span style="color: #008080;">then</span>
<span style="color: #000080;font-style:italic;">-- prettifying the output left as an exercise...</span>
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]</span>
<span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"processing %d/%d...\r"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span>
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d duplicates found\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">duplicates</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
136 files found
{2996224,"demo\\clocks\\love\\love-0.9.1-win32",{"love.dll","",2996224,2014,4,1,19,54,33}}
{2996224,"demo\\clocks\\love\\Chemical Me",{"love.dll","a",2996224,2014,4,1,19,54,32}}
{1059840,"demo\\clocks\\love\\love-0.9.1-win32",{"DevIL.dll","",1059840,2014,4,1,19,53,31}}
{1059840,"demo\\clocks\\love\\Chemical Me",{"DevIL.dll","a",1059840,2014,4,1,19,53,30}}
{875472,"demo\\clocks\\love\\love-0.9.1-win32",{"msvcr110.dll","",875472,2012,11,6,0,20,52}}
{875472,"demo\\clocks\\love\\Chemical Me",{"msvcr110.dll","a",875472,2012,11,6,0,20,52}}
{774656,"demo\\clocks\\love\\love-0.9.1-win32",{"SDL2.dll","",774656,2014,4,1,19,53,36}}
{774656,"demo\\clocks\\love\\Chemical Me",{"SDL2.dll","a",774656,2014,4,1,19,53,36}}
{535008,"demo\\clocks\\love\\love-0.9.1-win32",{"msvcp110.dll","",535008,2012,11,6,0,20,52}}
{535008,"demo\\clocks\\love\\Chemical Me",{"msvcp110.dll","a",535008,2012,11,6,0,20,52}}
{349184,"demo\\clocks\\love\\love-0.9.1-win32",{"OpenAL32.dll","",349184,2014,4,1,19,53,33}}
{349184,"demo\\clocks\\love\\Chemical Me",{"OpenAL32.dll","a",349184,2014,4,1,19,53,32}}
{347648,"demo\\clocks\\love\\love-0.9.1-win32",{"lua51.dll","",347648,2014,4,1,19,53,49}}
{347648,"demo\\clocks\\love\\Chemical Me",{"lua51.dll","a",347648,2014,4,1,19,53,48}}
{139264,"demo\\clocks\\love\\love-0.9.1-win32",{"mpg123.dll","",139264,2014,4,1,19,53,52}}
{139264,"demo\\clocks\\love\\Chemical Me",{"mpg123.dll","a",139264,2014,4,1,19,53,52}}
8 duplicates found
</pre>
 
=={{header|PicoLisp}}==
File duplicates in /bin dir on Void Linux. Hash provided by xxhash library via mmap.
<syntaxhighlight lang="picolisp">`(== 64 64)
# Call the C function "mmap" from the main program image ("@") and return
# its result as a number ('N), i.e. the address of the mapping.
# L is passed as the length and F as the file descriptor.
# NOTE(review): the literals 0 / 1 / 2 / 0 presumably mean addr=NULL,
# PROT_READ, MAP_PRIVATE and offset 0 (Linux values) -- confirm.
(de mmap (L F)
(native "@" "mmap" 'N 0 L 1 2 F 0) )
# Call the C function "munmap" with address A and length L, returning its
# numeric result.
(de munmap (A L)
(native "@" "munmap" 'N A L) )
# Hash S bytes at address M with XXH64 from libxxhash.so (third argument 0,
# presumably the seed -- confirm against the xxHash API).
# 'native' hands the result back as a signed number, so a negative value is
# shifted by 2^64 and masked with 2^64-1 to yield the unsigned 64-bit hash.
(de xxh64 (M S)
(let
(R (native "libxxhash.so" "XXH64" 'N M S 0)
P `(** 2 64) )
(if (lt0 R)
(& (+ R P) (dec P))
R ) ) )
# Recursively scan Dir and record every non-directory entry in the global
# 'idx' tree D, keyed by the first element of its 'info' result.
# Each tree entry has the shape (key (path path ...)).
# NOTE(review): (car Info) is presumably the file size, or T for a
# directory, and the T flag to 'info' presumably stops symlinks from being
# followed -- confirm against the PicoLisp reference.
(de walk (Dir)
(recur (Dir)
(for F (dir Dir)
(let (Path (pack Dir "/" F) Info (info Path T))
# Skip entries for which 'info' returned NIL.
(when (car Info)
(if (=T (car Info))
(recurse Path)
# Known key: prepend the path to its list; otherwise insert a new entry.
(if (lup D (car Info))
(push (cdr @) Path)
(idx 'D (list (car Info) (cons Path)) T) ) ) ) ) ) ) )
# Driver: reset D, scan "/bin", then for every key that collected more than
# one path ('cdadr' is non-NIL only for lists of two or more), group those
# paths by the xxh64 hash of their mapped contents and print the key together
# with each group that has more than one member.
# NOTE(review): the descriptor returned by (open F T) is never closed in
# this block, so one descriptor stays open per hashed file.
(off D)
(walk "/bin")
(for Lst (filter cdadr (idx 'D))
(let L
(by
'((F)
(let (M (mmap (car Lst) (open F T))
S (car Lst) )
# Hash first, then unmap; 'prog1' returns the hash.
(prog1 (xxh64 M S) (munmap M S)) ) )
group
(cadr Lst) )
(and (filter cdr L) (println (car Lst) @)) ) )</syntaxhighlight>
{{out}}
<pre>
1045 (("/bin/envvars-std" "/bin/envvars"))
1246 (("/bin/pdftexi2dvi" "/bin/texi2pdf"))
2346 (("/bin/gunzip" "/bin/uncompress"))
5719 (("/bin/roff2dvi" "/bin/roff2html" "/bin/roff2pdf" "/bin/roff2ps" "/bin/roff2text" "/bin/roff2x"))
35384 (("/bin/gcc-ar" "/bin/x86_64-unknown-linux-gnu-gcc-ar") ("/bin/gcc-nm" "/bin/x86_64-unknown-linux-gnu-gcc-nm"))
35392 (("/bin/gcc-ranlib" "/bin/x86_64-unknown-linux-gnu-gcc-ranlib"))
36478 (("/bin/aclocal-1.16" "/bin/aclocal"))
45800 (("/bin/perlthanks" "/bin/perlbug"))
178384 (("/bin/unzip" "/bin/zipinfo"))
257949 (("/bin/automake" "/bin/automake-1.16"))
512640 (("/bin/makewhatis" "/bin/mandoc"))
645464 (("/bin/gawk-5.0.1" "/bin/gawk"))
865760 (("/bin/zsh" "/bin/zsh-5.8"))
1129008 (("/bin/gcc" "/bin/x86_64-unknown-linux-gnu-gcc" "/bin/x86_64-unknown-linux-gnu-gcc-9.3.0"))
1133104 (("/bin/c++" "/bin/g++" "/bin/x86_64-unknown-linux-gnu-c++" "/bin/x86_64-unknown-linux-gnu-g++"))
</pre>
 
=={{header|Python}}==
 
<langsyntaxhighlight lang="python">from __future__ import print_function
import os
import hashlib
Line 206 ⟶ 1,240:
 
def FindDuplicateFiles(pth, minSize = 0, hashName = "md5"):
    """Print every set of files under ``pth`` that share size and content hash.

    pth      -- root directory, walked recursively with os.walk.
    minSize  -- files smaller than this many bytes are ignored.
    hashName -- any algorithm name accepted by hashlib.new().

    Symbolic links are skipped. Sets are printed in order of decreasing file
    size; each member line shows the UTC modification time, the size, the
    path relative to ``pth`` and "(Hard linked)" when the link count is > 1.
    Returns None.
    """
    chunkSize = 1024 * 1024  # hash in pieces so large files are not loaded whole
    knownFiles = {}  # size -> {digest -> [paths]}

    #Analyse files
    for root, dirs, files in os.walk(pth):
        for fina in files:
            fullFina = os.path.join(root, fina)
            if os.path.islink(fullFina):
                continue # Skip symlinks
            si = os.path.getsize(fullFina)
            if si < minSize:
                continue
            h = hashlib.new(hashName)
            # The context manager closes the handle even if reading fails.
            with open(fullFina, "rb") as fi:
                for chunk in iter(lambda: fi.read(chunkSize), b""):
                    h.update(chunk)
            knownFiles.setdefault(si, {}).setdefault(h.digest(), []).append(fullFina)

    #Print result
    for si in sorted(knownFiles, reverse=True):
        filesAtThisSize = knownFiles[si]
        for hashVal in filesAtThisSize:
            fullFinaLi = filesAtThisSize[hashVal]
            if len(fullFinaLi) < 2:
                continue
            print ("=======Duplicate=======")
            for fullFina in fullFinaLi:
                st = os.stat(fullFina)
                isHardLink = st.st_nlink > 1
                infoStr = []
                if isHardLink:
                    infoStr.append("(Hard linked)")
                fmtModTime = datetime.datetime.utcfromtimestamp(st.st_mtime).strftime('%Y-%m-%dT%H:%M:%SZ')
                print (fmtModTime, si, os.path.relpath(fullFina, pth), " ".join(infoStr))
 
# Example invocation when run as a script: scan the hard-coded directory for
# duplicate files of at least 1024*1024 bytes.
if __name__=="__main__":

FindDuplicateFiles('/home/tim/Dropbox', 1024*1024)
</syntaxhighlight>
</lang>
 
=={{header|Racket}}==
<langsyntaxhighlight lang="racket">
#lang racket
 
Line 294 ⟶ 1,328:
 
(show-duplicates (find-system-path 'home-dir) 1024)
</syntaxhighlight>
</lang>
 
=={{header|Raku}}==
(formerly Perl 6)
 
This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same.
 
<syntaxhighlight lang="raku" line>use Digest::SHA256::Native;
 
# Entry point. $dir is the directory to scan, :$minsize the smallest file
# size (in bytes) to consider, :$recurse whether to descend into
# sub-directories. Prints one line per set of files whose hashes match.
sub MAIN( $dir = '.', :$minsize = 5, :$recurse = True ) {
# %files maps a file size to the list of paths having that size.
my %files;
# @dirs is the stack of directories still to be listed.
my @dirs = $dir.IO.absolute.IO;
while @dirs {
my @files = @dirs.pop;
while @files {
for @files.pop.dir -> $path {
%files{ $path.s }.push: $path if $path.f and $path.s >= $minsize;
# Only readable directories are queued, and only when recursing.
@dirs.push: $path if $path.d and $path.r and $recurse
}
}
}

# Sizes are visited in ascending numeric order; sizes with a single file are skipped.
for %files.sort( +*.key ).grep( *.value.elems > 1)».kv -> ($size, @list) {
# %dups maps a SHA-256 hex digest to the paths whose content produced it.
my %dups;
# NOTE(review): the binary slurp is decoded to a string before hashing;
# presumably this can fail on content that is not valid UTF-8 -- confirm.
@list.map: { %dups{ sha256-hex( ($_.slurp :bin).decode ) }.push: $_.Str };
for %dups.grep( *.value.elems > 1)».value -> @dups {
say sprintf("%9s : ", scale $size ), @dups.join(', ');
}
}
}
 
# Format a byte count for display: plain bytes below 2**10, otherwise the
# value divided by 2**10, 2**20 or 2**30 and rounded to one decimal place,
# followed by ' B', ' KB', ' MB' or ' GB' respectively.
sub scale ($bytes) {
given $bytes {
when $_ < 2**10 { $bytes ~ ' B' }
when $_ < 2**20 { ($bytes / 2**10).round(.1) ~ ' KB' }
when $_ < 2**30 { ($bytes / 2**20).round(.1) ~ ' MB' }
default { ($bytes / 2**30).round(.1) ~ ' GB' }
}
}</syntaxhighlight>
{{out|Sample output}}
Passing in command line switches: --minsize=0 --recurse=False /home/me/p6
<pre> 0 B : /home/me/p6/vor.ppm, /home/me/p6/ns.txt
190 B : /home/me/p6/scrub(copy).t, /home/me/p6/scrub.t
1.3 KB : /home/me/p6/coco.p6, /home/me/p6/coc.p6
80.5 KB : /home/me/p6/temp.txt, /home/me/p6/temp.html
279.6 KB : /home/me/p6/pentaflake.svg, /home/me/p6/5nflake.svg</pre>
 
=={{header|REXX}}==
Line 301 ⟶ 1,380:
<br>Note that the &nbsp; '''tFID''' &nbsp; (temp) &nbsp; file is hard coded to the &nbsp; '''C:''' &nbsp; drive.
<br>Only minimal error checking is performed.
<langsyntaxhighlight lang="rexx">/*REXX program to reads a (DOS) directory and finds and displays files that identical.*/
sep=center(' files are identical in size and content: ',79,"═") /*define the header. */
tFID= 'c:\TEMP\FINDDUP.TMP' /*use this as a temporary FileID. */
Line 326 ⟶ 1,405:
 
if lines(tFID)\==0 then 'ERASE' tFID /*do housecleaning (delete temp file).*/
/*stick a fork in it, we're all done. */</langsyntaxhighlight>
'''output''' &nbsp; when using (checking) with the default root directory:
<pre>
Line 350 ⟶ 1,429:
::* &nbsp; uses variables for some command names and command options
::* &nbsp; shows the number of files examined and also the directory name
<langsyntaxhighlight lang="rexx">/*REXX program to reads a (DOS) directory and finds and displays files that identical.*/
sep=center(' files are identical in size and content: ',79,"═") /*define the header. */
parse arg !; if !all(arg()) then exit /*boilerplate HELP(?)*/
Line 406 ⟶ 1,485:
novalue: syntax: call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl)
p: return word(arg(1),1)
s: if arg(1)==1 then return arg(3); return word(arg(2) 's',1)</langsyntaxhighlight>
'''output''' &nbsp; when using the DIR (folder): &nbsp; H:\#\REX
<pre>
Line 426 ⟶ 1,505:
 
1568 files examined in H:\#\REX\
</pre>
 
=={{header|Ring}}==
<syntaxhighlight lang="ring">
# Project : Find duplicate files
 
# Lists the entries of directory d whose content length matches that of a
# neighbouring entry after sorting. Only lengths are compared here; file
# contents are never compared.
d = "/Windows/System32"
chdir(d)
dir = dir(d)
# dirlist collects [content length, name] pairs.
dirlist = []
for n = 1 to len(dir)
# NOTE(review): dir[n][2] = 0 presumably marks a regular file (not a sub-directory) -- confirm.
if dir[n][2] = 0
# The whole file is read just to obtain its length.
str = read(dir[n][1])
lenstr = len(str)
add(dirlist,[lenstr,dir[n][1]])
ok
next
see "Directory : " + d + nl
see "--------------------------------------------" + nl
dirlist = sortfirst(dirlist)
# line = 1 requests a separator after the current pair.
line = 0
for n = 1 to len(dirlist)-1
# Adjacent entries with equal length are printed as a pair.
if dirlist[n][1] = dirlist[n+1][1]
see "" + dirlist[n][1] + " " + dirlist[n][2] + nl
see "" + dirlist[n+1][1] + " " + dirlist[n+1][2] + nl
# The run of equal lengths ends when the entry after next has a different length.
if n < len(dirlist)-2 and dirlist[n+1][1] != dirlist[n+2][1]
line = 1
ok
else
line = 0
ok
if line = 1
see "--------------------------------------------" + nl
ok
next
 
# Sort a list of [number, string] pairs in place and return it: ascending by
# the first element, with equal first elements ordered by strcmp on the
# second. Every pair of positions is compared and swapped when out of order.
func sortfirst(alist)
for n = 1 to len(alist) - 1
for m = n + 1 to len(alist)
if alist[m][1] < alist[n][1]
swap(alist,m,n)
ok
# Tie on the first element: fall back to the string comparison.
if alist[m][1] = alist[n][1] and strcmp(alist[m][2],alist[n][2]) < 0
swap(alist,m,n)
ok
next
next
return alist
</syntaxhighlight>
Output:
<pre>
Directory : /Windows/System32
--------------------------------------------
0 nsprs.dll
0 nsprs.tgz
0 nsprs.tgz
0 serauth1.dll
0 serauth1.dll
0 serauth2.dll
--------------------------------------------
16 jm1ixs2.dll
16 qmtn7ft.dll
--------------------------------------------
......
--------------------------------------------
1189376 Windows.Globalization.dll
1189376 wscui.cpl
--------------------------------------------
1192448 Windows.UI.Xaml.Maps.dll
1192448 dfshim.dll
--------------------------------------------
1295360 MSVPXENC.dll
1295360 comres.dll
--------------------------------------------
1311744 SensorsCpl.dll
1311744 msjet40.dll
--------------------------------------------
</pre>
 
=={{header|Ruby}}==
Files are first compared by size; only when the sizes match is an MD5 digest of the contents compared.
<langsyntaxhighlight lang="ruby">require 'digest/md5'
 
def find_duplicate_files(dir)
Line 447 ⟶ 1,603:
end
 
find_duplicate_files("/Windows/System32")</langsyntaxhighlight>
 
Sample Output:
Line 467 ⟶ 1,623:
</pre>
The program was tested on MS Windows 7.
 
=={{header|Rust}}==
<syntaxhighlight lang="rust">use std::{
collections::BTreeMap,
fs::{read_dir, File},
hash::Hasher,
io::Read,
path::{Path, PathBuf},
};
 
/// Files grouped by `(size in bytes, content hash)`.
///
/// A group holding more than one path is a set of probable duplicates.
/// Because the map is ordered, iterating it in reverse visits the largest
/// sizes first.
type Duplicates = BTreeMap<(u64, u64), Vec<PathBuf>>;

/// Recursive directory scanner that fills a [`Duplicates`] map.
struct DuplicateFinder {
    /// Groups collected so far.
    found: Duplicates,
    /// Files smaller than this many bytes are ignored.
    min_size: u64,
}

impl DuplicateFinder {
    /// Scans `path` recursively and returns every regular file of at least
    /// `min_size` bytes, grouped by size and content hash.
    ///
    /// Returns the first I/O error encountered while listing or reading.
    fn search(path: impl AsRef<Path>, min_size: u64) -> std::io::Result<Duplicates> {
        let mut result = Self {
            found: BTreeMap::new(),
            min_size,
        };

        result.walk(path)?;
        Ok(result.found)
    }

    /// Visits every entry below `path`, descending into real directories and
    /// hashing regular files.
    fn walk(&mut self, path: impl AsRef<Path>) -> std::io::Result<()> {
        for entry in read_dir(path.as_ref())? {
            let entry = entry?;
            // `file_type` does not follow symbolic links, so a link cycle
            // cannot cause unbounded recursion and a dangling link cannot
            // abort the scan with a "not found" error.
            let file_type = entry.file_type()?;
            if file_type.is_dir() {
                self.walk(entry.path())?;
            } else if file_type.is_file() {
                self.compute_digest(&entry.path())?;
            }
            // Symlinks, sockets, devices, etc. are not file copies: skip them.
        }

        Ok(())
    }

    /// Hashes `file` and records it under its `(size, hash)` key, unless it
    /// is smaller than `min_size`.
    fn compute_digest(&mut self, file: &Path) -> std::io::Result<()> {
        let size = file.metadata()?.len();
        if size < self.min_size {
            return Ok(());
        }

        // This hasher is weak, we could otherwise use an external crate.
        // Groups are not re-checked byte by byte, so a hash collision between
        // two equally sized files would be reported as a duplicate.
        let mut hasher = std::collections::hash_map::DefaultHasher::default();
        let mut bytes = [0u8; 8192];
        let mut f = File::open(file)?;
        loop {
            let n = f.read(&mut bytes)?;
            if n == 0 {
                break; // end of file
            }
            hasher.write(&bytes[..n]);
        }

        let hash = hasher.finish();

        self.found
            .entry((size, hash))
            .or_default()
            .push(file.to_owned());

        Ok(())
    }
}
 
// Command-line entry point.
// First argument: directory to scan (defaults to "." when absent).
// Second argument: minimum file size in bytes (defaults to 0 when absent or
// not parseable as u64).
// Prints each group holding more than one file, largest size first.
fn main() -> std::io::Result<()> {
    let mut args = std::env::args();

    args.next(); // Skip the executable name
    let dir = args.next().unwrap_or_else(|| ".".to_owned());

    let min_size = args
        .next()
        .and_then(|arg| arg.parse::<u64>().ok())
        .unwrap_or(0u64);

    // The map is keyed by (size, hash), so reverse iteration yields
    // decreasing sizes.
    DuplicateFinder::search(dir, min_size)?
        .iter()
        .rev()
        .filter(|(_, files)| files.len() > 1)
        .for_each(|((size, _), files)| {
            println!("Size: {}", size);

            files
                .iter()
                .for_each(|file| println!("{}", file.to_string_lossy()));

            println!();
        });

    Ok(())
}</syntaxhighlight>
 
=={{header|Sidef}}==
It uses the portable ''File::Find'' module, which means that it should work on virtually any platform.
<langsyntaxhighlight lang="ruby"># usage: sidef fdf.sf [size] [dir1] [...]
 
require('File::Find')
Line 518 ⟶ 1,773:
say "#{files.sort.join(%Q[\n])}\n#{'-'*80}"
}
}</langsyntaxhighlight>
Section of sample output:
<pre>% sidef fdf.sf 0 /tmp /usr/bin
Line 545 ⟶ 1,800:
{{tcllib|fileutil}}
{{tcllib|md5}}
<langsyntaxhighlight lang="tcl">package require fileutil
package require md5
 
proc finddupfiles {dir {minsize 1}} {
foreach fn [fileutil::find $dir] {
file lstat $fn stat
if {$stat(size) < $minsize} continue
dict lappend byino $stat(dev),$stat(ino) $fn
if {$stat(type) ne "file"} continue
set f [open $fn "rb"]
set content [read $f]
close $f
set md5 [md5::md5 -hex $content]
dict lappend byhash $md5 $fn
}
set groups {}
foreach group [dict values $byino] {
if {[llength $group] <= 1} continue
set gs [lsort $group]
dict set groups [lindex $gs 0] $gs
}
foreach group [dict values $byhash] {
if {[llength $group] <= 1} continue
foreach f $group {
if {[dict exists $groups $f]} {
dict set groups $f [lsort -unique \
[concat [dict get $groups $f] $group]]
unset group
break
}
}
}
if {[info exist group]} {
set gs [lsort $group]
dict set groups [lindex $gs 0] $gs
}
}
}
set masters {}
dict for {n g} $groups {
lappend masters [list $n [llength $g],$n]
}
set result {}
foreach p [lsort -decreasing -index 1 -dictionary $masters] {
set n [lindex $p 0]
lappend result $n [dict get $groups $n]
}
return $result
Line 597 ⟶ 1,852:
set n 0
foreach d $dupes {
if {$d ne $leader} {
puts " dupe #[incr n]: $d"
}
}
}
}</lang>
}</syntaxhighlight>
Section of sample output:
<pre>
Line 610 ⟶ 1,865:
./compat/zlib/contrib/delphi/zlibd32.mak has duplicates
dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak
</pre>
 
=={{header|Wren}}==
{{libheader|Wren-crypto}}
{{libheader|Wren-sort}}
<syntaxhighlight lang="wren">import "io" for Directory, File, Stat
import "./crypto" for Sha1
import "./sort" for Sort
 
// Prints the sets of files directly inside `dir` (sub-directories are not
// descended into) whose size is at least `minSize` and whose SHA-1 digests
// are equal, ordered by decreasing size.
// Aborts the fiber if `dir` does not exist.
var findDuplicates = Fn.new { |dir, minSize|
if (!Directory.exists(dir)) Fiber.abort("Directory does not exist.")
// Keep only entries whose reported size reaches the minimum.
var files = Directory.list(dir).where { |f| Stat.path("%(dir)/%(f)").size >= minSize }
// hashMap: digest -> list of file names with that digest.
var hashMap = {}
for (file in files) {
var path = "%(dir)/%(file)"
if (Stat.path(path).isDirectory) continue
// The whole file is read into memory before hashing.
var contents = File.read(path)
var hash = Sha1.digest(contents)
var exists = hashMap.containsKey(hash)
if (exists) {
hashMap[hash].add(file)
} else {
hashMap[hash] = [file]
}
}
// duplicates: list of [size, names] for every digest shared by 2+ files.
var duplicates = []
for (key in hashMap.keys) {
if (hashMap[key].count > 1) {
var files = hashMap[key]
// All members share a digest, so the first member's size is used for the set.
var path = "%(dir)/%(files[0])"
var size = Stat.path(path).size
duplicates.add([size, files])
}
}
var cmp = Fn.new { |i, j| (j[0] - i[0]).sign } // by decreasing size
Sort.insertion(duplicates, cmp)
System.print("The sets of duplicate files are:\n")
for (dup in duplicates) {
System.print("Size %(dup[0]) bytes:")
System.print(dup[1].join("\n"))
System.print()
}
}
 
findDuplicates.call("./", 1000)</syntaxhighlight>
 
{{out}}
Sample output:
<pre>
The sets of duplicate files are:
 
Size 57221 bytes:
big.wren
big2.wren
 
Size 16696 bytes:
cls
clsc
 
Size 4096 bytes:
data.blk
data2.blk
 
Size 1415 bytes:
circular.wren
circular2.wren
</pre>
 
Line 618 ⟶ 1,939:
 
File findDupFiles.zkl:
<langsyntaxhighlight lang="zkl">include(zkl.h.zkl);
const FLAGS=FILE.GLOB.IGNORE_CASE + FILE.GLOB.NO_DIRS;
var [const] MsgHash=Import("zklMsgHash");
Line 625 ⟶ 1,946:
argh:=Utils.Argh(
T("+R","R","Recurse into subdirectories, starting at <arg>",
fcn(arg){ recurse=arg }),
T("+minSz","","Only consider files larger than <arg>",
fcn(arg){ minSz=arg.toInt() }),
T("+maxSz","","Only consider files less than <arg>",
fcn(arg){ maxSz=arg.toInt() }),
);
 
Line 641 ⟶ 1,962:
fnames:=Data(0,String);
if (recurse) File.globular(recurse,fileSpec,True,FLAGS,fnames);
else File.glob(fileSpec,FLAGS).pump(fnames);
 
files:=Dictionary(); // (len:(name,name...), ...)
Line 654 ⟶ 1,975:
files.apply("len").sum(0)));
 
if(not files) System.exit(); // no files found
 
buffer:=Data(0d100_000); // we'll resuse this buffer for hashing
Line 671 ⟶ 1,992:
println("Found %d duplicate files:".fmt(files.apply("len").sum(0)));
foreach group in (files){ println(" ",group.concat(", ")) }</langsyntaxhighlight>
{{out}}
<pre>
2,458

edits