Find duplicate files: Difference between revisions
→{{header|jq}}: -R
m (added whitespace and a ;Task:.) |
(→{{header|jq}}: -R) |
||
(13 intermediate revisions by 9 users not shown) | |||
Line 14:
For extra points, detect when whole directory sub-trees are identical, or optionally remove or link identical files.
<br><br>
=={{header|C++}}==
The code uses
- xxhash_cpp downloaded from https://github.com/RedSpah/xxhash_cpp
- boost filesystem, boost format, and boost iostreams
Compiles on Windows10 and Linux.
<syntaxhighlight lang="cpp">
#include<algorithm>
#include<cstdint>
#include<exception>
#include<execution>
#include<iostream>
#include<iterator>
#include<optional>
#include<stdexcept>
#include<string>
#include<tuple>
#include<vector>
#include<boost/filesystem.hpp>
#include<boost/format.hpp>
#include<boost/iostreams/device/mapped_file.hpp>
#include"dependencies/xxhash.hpp" // https://github.com/RedSpah/xxhash_cpp
/**
* Split [begin, end[ into maximal runs of neighbouring elements that map to
* the same value and invoke callback once for every run.
* @param begin start of container
* @param end end of container (1 beyond last element)
* @param getvalue maps a dereferenced iterator to the value being compared
* @param callback void(start, end, value), receives the bounds and value of a run
* @return number of runs found (0 for an empty range)
*/
template<typename T, typename V, typename F>
size_t for_each_adjacent_range(T begin, T end, V getvalue, F callback) {
    size_t runs = 0;
    for (T run_begin = begin; run_begin != end; ++runs) {
        auto const& run_value = getvalue(*run_begin);
        // Advance past every neighbour that maps to the same value.
        T run_end = run_begin;
        ++run_end;
        while (run_end != end && getvalue(*run_end) == run_value)
            ++run_end;
        callback(run_begin, run_end, run_value);
        run_begin = run_end;
    }
    return runs;
}
namespace bi = boost::iostreams;
namespace fs = boost::filesystem;
/**
* A file found during the scan: its path, the size reported by the filesystem
* when the entry was created, and a lazily computed, cached xxhash64 digest of
* its contents.
*/
struct file_entry {
public:
    /// Stores the entry's path and queries its size via fs::file_size.
    explicit file_entry(fs::directory_entry const & entry)
        : path_{entry.path()}, size_{fs::file_size(entry)}
    {}
    /// Size in bytes as recorded at construction time.
    auto size() const { return size_; }
    auto const& path() const { return path_; }
    /// Returns the content hash, computing and caching it on first use.
    /// Throws std::runtime_error when the file cannot be opened.
    auto get_hash() {
        if (!hash_)
            hash_ = compute_hash();
        return *hash_;
    }
private:
    /// Hashes the file contents through a read-only memory mapping.
    xxh::hash64_t compute_hash() {
        xxh::hash_state64_t hash_stream;
        // NOTE(review): mapping a zero-length file is expected to fail, so an
        // empty file gets the digest of an empty input without being mapped.
        if (size_ == 0)
            return hash_stream.digest();
        bi::mapped_file_source source;
        source.open<fs::wpath>(this->path());
        if (!source.is_open()) {
            std::cerr << "Cannot open " << path() << std::endl;
            throw std::runtime_error("Cannot open file");
        }
        // Hash exactly the mapped bytes: the file may have changed size since
        // size_ was recorded, and reading size_ bytes could run past the mapping.
        hash_stream.update(source.data(), source.size());
        return hash_stream.digest();
    }
private:
    fs::wpath path_;
    uintmax_t size_;
    std::optional<xxh::hash64_t> hash_;
};
using vector_type = std::vector<file_entry>;
using iterator_type = vector_type::iterator;
/**
* Recursively scan a directory and append every regular file of at least
* min_size bytes to file_vector.
* @param path directory to scan; a non-directory is reported on stderr
* @param file_vector receives one file_entry per accepted file
* @param min_size smallest file size (in bytes) that is accepted
* @return tuple (entries visited, entries ignored)
*/
auto find_files_in_dir(fs::wpath const& path, vector_type& file_vector, uintmax_t min_size = 1) {
    size_t found = 0;
    size_t ignored = 0;
    if (fs::is_directory(path)) {
        std::cerr << "Searching " << path << std::endl;
        for (auto& entry : fs::recursive_directory_iterator(path)) {
            ++found;
            bool const accepted = fs::is_regular_file(entry) && fs::file_size(entry) >= min_size;
            if (!accepted) {
                ++ignored;
                continue;
            }
            file_vector.emplace_back(entry);
        }
    }
    else {
        std::cerr << path << " is not a directory!" << std::endl;
    }
    return std::make_tuple(found, ignored);
}
/**
* Scans every directory named on the command line and prints, on stdout, each
* group of files that share both size and content hash. Progress and
* statistics go to stderr.
* @return 0 on success, 1 when scanning or hashing raised an exception
*/
int main(int argn, char* argv[])
{
    try {
        vector_type files;
        for (auto i = 1; i < argn; ++i) {
            fs::wpath path(argv[i]);
            auto [found, ignored] = find_files_in_dir(path, files);
            std::cerr << boost::format{
                " %1$6d files found\n"
                " %2$6d files ignored\n"
                " %3$6d files added\n" } % found % ignored % (found - ignored)
                << std::endl;
        }
        std::cerr << "Found " << files.size() << " regular files" << std::endl;
        // sort files in descending order by file size so equal sizes become neighbours
        std::sort(std::execution::par_unseq, files.begin(), files.end()
            , [](auto const& a, auto const& b) { return a.size() > b.size(); }
        );
        for_each_adjacent_range(
            std::begin(files)
            , std::end(files)
            , [](vector_type::value_type const& f) { return f.size(); }
            , [](auto start, auto end, auto file_size) {
                // Files with same size; a single file cannot have a duplicate,
                // so its hash is never computed.
                size_t nr_of_files = std::distance(start, end);
                if (nr_of_files > 1) {
                    // sort range start-end by hash, ties broken by path
                    std::sort(start, end, [](auto& a, auto& b) {
                        auto const& ha = a.get_hash();
                        auto const& hb = b.get_hash();
                        auto const& pa = a.path();
                        auto const& pb = b.path();
                        return std::tie(ha, pa) < std::tie(hb, pb);
                        });
                    for_each_adjacent_range(
                        start
                        , end
                        , [](vector_type::value_type& f) { return f.get_hash(); }
                        , [file_size](auto hstart, auto hend, auto hash) {
                            // Files with same size and same hash are assumed to be identical
                            // could resort to compare files byte-by-byte now
                            size_t hnr_of_files = std::distance(hstart, hend);
                            if (hnr_of_files > 1) {
                                std::cout << boost::format{ "%1$3d files with hash %3$016x and size %2$d\n" }
                                    % hnr_of_files % file_size % hash;
                                std::for_each(hstart, hend, [](auto& e) {
                                    std::cout << '\t' << e.path() << '\n';
                                    }
                                );
                            }
                        }
                    );
                }
            }
        );
        return 0;
    }
    catch (std::exception const& e) {
        // Filesystem and hashing failures surface here instead of terminating
        // the program without a message.
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}
</syntaxhighlight>
{{out}}
<pre>$ ./duplicates /usr/include/boost/
Searching "/usr/include/boost/"
15264 files found
1160 files ignored
14104 files added
Found 14104 regular files
2 files with hash 0c5e81a47dd8cd99 and size 15811
"/usr/include/boost/mpl/vector/aux_/preprocessed/no_ctps/vector50_c.hpp"
"/usr/include/boost/mpl/vector/aux_/preprocessed/plain/vector50_c.hpp"
3 files with hash 0f2775c41bb647f3 and size 14766
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/vector.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/vector.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/vector.hpp"
3 files with hash f9e02ecc3e38f3a3 and size 14714
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/deque.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/deque.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/deque.hpp"
3 files with hash 73ed6d15fd62f8b3 and size 14620
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/list.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/list.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/list.hpp"
3 files with hash 7a43c97436ae1913 and size 14547
"/usr/include/boost/mpl/aux_/preprocessed/msvc60/set.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/msvc70/set.hpp"
"/usr/include/boost/mpl/aux_/preprocessed/no_ctps/set.hpp"
...
</pre>
=={{header|Elixir}}==
{{trans|Ruby}}
<
def find_duplicate_files(dir) do
IO.puts "\nDirectory : #{dir}"
Line 38 ⟶ 225:
end
hd(System.argv) |> Files.find_duplicate_files</
{{out}}
Line 62 ⟶ 249:
=={{header|Go}}==
In theory this should work on any of the operating systems supported by Go (Linux, macOS, Windows, OpenBSD etc.) though only tested on Ubuntu 16.04.
<
import (
Line 132 ⟶ 319:
fmt.Println()
}
}</
{{out}}
Line 160 ⟶ 347:
- works on Windows as well as Unix Systems (tested with Mint 17 / Windows 7)
</pre>
<syntaxhighlight lang="haskell">
import Crypto.Hash.MD5 (hash)
import Data.ByteString as BS (readFile, ByteString())
Line 224 ⟶ 411:
printf "Something went wrong - please use ./%s <dir> <bytes>\n" name
</syntaxhighlight>
Example output:
Line 264 ⟶ 451:
Hard links are indicated by displaying the files on the same line separated by "=".
MD5 checksums are used to detect duplicate files.
<
import java.nio.*;
import java.nio.file.*;
Line 401 ⟶ 588:
return Integer.compare(len1, len2);
}
}</
{{out}}
Line 416 ⟶ 603:
file1 = file3
file6
</pre>
=={{header|jq}}==
'''Works with jq, the C implementation of jq'''
'''Works with gojq, the Go implementation of jq'''
'''Works with jaq, the Rust implementation of jq'''
This entry illustrates how jq plays nicely with other command-line
tools -
in this case jc (https://kellyjonbrazil.github.io/jc) is used to JSONify the output of `md5sum`,
the idea being that in practice, two files will almost surely have the same
contents if their md5sum values are the same. It should be noted, however,
that jq can also easily be used to parse the raw output of `md5sum` by using the -R option;
the modifications required for the jq program given below would all be trivial to make.
We'll also use the `-type f` option of the `find` command as this excludes symbolic links,
which we'll assume should be ignored.
An appropriate invocation of these three command-line tools would be along the lines of:
<pre>
jc md5sum $(find . -type f ) | jq -nc -f find-duplicate-files.jq
</pre>
The output will be a stream of arrays, each array listing the files with the same hash.
<syntaxhighlight lang="jq">
# Expected input: a JSON array of objects, each of which has a "hash" key
# and a "filename" key.

# Build an object that maps each key produced by `f` to the array of the
# corresponding `g` values, taken over all items of `stream`.
def dictionary(stream; f; g):
  reduce stream as $item ({}; .[($item|f)] |= (. + [$item|g]));

# Emit one array of filenames for every hash shared by at least two files.
dictionary(inputs[]; .hash; .filename)
| .[]
| select(length > 1)
</syntaxhighlight>
{{output}}
<pre>
["./toplevel.txt","./tmp/toplevel.txt"]
</pre>
Line 423 ⟶ 651:
Should work on Windows, macOS and Linux.
<
function find_duplicates(path::String, minsize::Int = 0)
Line 467 ⟶ 695:
end
main()</
{{out}}
Line 488 ⟶ 716:
</pre>
=={{header|Mathematica}}/{{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">hash="SHA256";
minSize=Quantity[1,"Megabytes"];
(* All non-directory files below the current directory that are larger than minSize. *)
allfiles=Once@Select[FileNames["*","",∞],!Once@DirectoryQ[#]&&Once@FileSize[#]>minSize&];
(* Pair every candidate file with its hash so that no duplicate can be missed. *)
data={#,Once[FileHash[#,hash,All,"HexString"]]}&/@allfiles;
Grid[Select[GatherBy[data,Last],Length[#]>1&][[All,All,1]]]</syntaxhighlight>
{{out|Sample output}}
sample directory:
Line 501 ⟶ 728:
someFile3 bef0039c33277f743b60b0076871110b96e14de34045aafc8e764349de6043b5
directory\someFile eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe
directory\someFile4 e6385b50ec8b052b141588573f680261db714babe534d8ced8a17985b14f58e9</pre>
sample output:
<pre>35 MB {someFile,directory\someFile}</pre>
=={{header|Nim}}==
Our solution works on Linux and likely on any Posix system. To mark hard links, we provide the inode number of the file. Two paths with the same inode number are in fact two links to the same file. To make them more visible, an asterisk is used.
The detection of hard links may be OS dependent and may not work on Windows.
<syntaxhighlight lang="nim">import algorithm
import os
import strformat
import strutils
import tables
import std/sha1
import times
type
# Mapping "size" -> "list of paths".
PathsFromSizes = Table[BiggestInt, seq[string]]
# Mapping "hash" -> "list of paths".
PathsFromHashes = Table[string, seq[string]]
# One group of duplicate files: their common size and their paths.
Info = tuple[size: BiggestInt; paths: seq[string]]
#---------------------------------------------------------------------------------------------------
proc processCmdLine(): tuple[dirpath: string; minsize: Natural] =
  ## Read the folder path and the optional minimum size from the command
  ## line. Additional parameters are ignored; a missing folder, a path that
  ## is not a directory or an unparsable size ends the program with a message.
  if paramCount() == 0:
    quit fmt"Usage: {getAppFileName().splitPath()[1]} folder minsize"

  result.dirpath = paramStr(1)
  if not dirExists(result.dirpath):
    quit fmt"Wrong directory path: {result.dirpath}"

  # The minimum size is optional; without it the default value is kept.
  if paramCount() < 2:
    return
  try:
    result.minsize = parseInt(paramStr(2))
  except ValueError:
    quit fmt"Wrong minimum size: {paramStr(2)}"
#---------------------------------------------------------------------------------------------------
proc initPathsFromSize(dirpath: string; minsize: Natural): PathsFromSizes =
  ## Walk "dirpath" recursively and group, by file size, the paths of all
  ## files whose size is at least "minsize".
  for path in walkDirRec(dirpath):
    if fileExists(path):
      let size = getFileSize(path)
      if size >= minsize:
        # Append the path to the list kept for this size.
        result.mgetOrPut(size, @[]).add(path)
#---------------------------------------------------------------------------------------------------
proc initPathsFromHashes(pathsFromSizes: PathsFromSizes): PathsFromHashes =
  ## Hash every file whose size is shared with at least one other file and
  ## group the paths by the resulting digest.
  for candidates in pathsFromSizes.values:
    if candidates.len < 2:
      continue  # A unique size cannot belong to a duplicate.
    for path in candidates:
      let digest = $secureHashFile(path)
      result.mgetOrPut(digest, @[]).add(path)
#---------------------------------------------------------------------------------------------------
proc cmp(x, y: Info): int =
  ## Order two groups of duplicate files by size, then by their first path.
  ## Used to sort the list of duplicate files.
  let bySize = cmp(x.size, y.size)
  if bySize != 0:
    return bySize
  # Same size: fall back to the first paths, which are known to differ.
  cmp(x.paths[0], y.paths[0])
#---------------------------------------------------------------------------------------------------
proc displayDuplicates(dirpath: string; pathsFromHashes: PathsFromHashes) =
  ## Print every group of files sharing a hash, largest files first, with
  ## size, modification time, inode number and a hard-link marker.
  echo "Files with same size and same SHA1 hash value in directory: ", dirpath
  echo ""

  # Keep only the hashes that are shared by two or more files.
  var duplicates: seq[Info]
  for group in pathsFromHashes.values:
    if group.len > 1:
      duplicates.add((getFileSize(group[0]), sorted(group)))
  if duplicates.len == 0:
    echo "No files"
    return
  duplicates.sort(cmp, Descending)

  # Header, then one blank-line separated section per group.
  echo fmt"""{"Size":>10} {"Last date modified":^19} {"Inode":>8} HL File name"""
  echo repeat('=', 80)
  for (size, paths) in duplicates:
    echo ""
    for path in paths:
      let mtime = getLastModificationTime(path).format("YYYY-MM-dd HH:mm:ss")
      let info = getFileInfo(path)
      let inode = info.id.file
      # A link count other than one is flagged with an asterisk.
      let hardlink = if info.linkCount == 1: " " else: "*"
      echo fmt"{size:>10} {mtime:>23} {inode:>12} {hardlink:<5} {path.relativePath(dirpath)}"
#———————————————————————————————————————————————————————————————————————————————————————————————————
# Parse the arguments, group the files by size, hash the candidates and report.
let (dirpath, minsize) = processCmdLine()
let sizeGroups = initPathsFromSize(dirpath, minsize)
let hashGroups = initPathsFromHashes(sizeGroups)
displayDuplicates(dirpath, hashGroups)</syntaxhighlight>
{{out}}
<pre>Files with same size and same SHA1 hash value in directory: .
Size Last date modified Inode HL File name
================================================================================
499515 2020-12-10 22:48:06 12981503 subdir/tree.ppm
499515 2020-12-10 22:45:26 12722201 * subdir/tree1.ppm
499515 2020-12-10 22:45:26 12722201 * tree.ppm
499515 2020-12-10 22:47:51 12722205 tree1.ppm
65322 2020-12-10 22:44:53 12722178 * house.jpg
65322 2020-12-10 22:44:53 12722178 * house1.jpeg
6401 2020-12-10 22:45:07 12722182 dragon.png
6401 2020-12-10 22:45:53 12722204 dragon1.png
6401 2020-12-10 22:46:21 12981502 subdir/dragon.png</pre>
=={{header|Objeck}}==
Solution works on Windows, macOS and Linux.
<
use System.Time;
use Collection;
Line 593 ⟶ 954:
return "{$@name}, {$@size}, {$date_str}";
}
}</
{{output}}
Line 609 ⟶ 970:
/tmp/foo/ee.obe, 20020, 3/29/2019 8:47:33 PM
/tmp/foo/dd.obe, 20020, 3/29/2019 8:47:14 PM
</pre>
=={{header|OCaml}}==
Although this solution uses the Unix module, it only calls lstat from there, which exists in the Windows port of the Unix module and so should be portable.
<syntaxhighlight lang="ocaml">(* List the entries of [dir]; when the directory cannot be read, report the
   error on stderr and return an empty array. *)
let readdir_or_empty dir =
  match Sys.readdir dir with
  | entries -> entries
  | exception Sys_error msg ->
      prerr_endline ("Could not read dir " ^ dir ^ ": " ^ msg);
      [||]
(* Recursively visit [root], calling [func path stat] for every regular file.
   Entries are examined with lstat; anything that is neither a directory nor
   a regular file is skipped. *)
let directory_walk root func =
  let rec visit dir =
    let handle name =
      let path = Filename.concat dir name in
      let stat = Unix.lstat path in
      match stat.Unix.st_kind with
      | Unix.S_DIR -> visit path
      | Unix.S_REG -> func path stat
      | _ -> ()
    in
    Array.iter handle (readdir_or_empty dir)
  in
  visit root
(* Keep reading from [ic] into [buf] starting at [pos] until [len] bytes have
   arrived or a read returns 0; the result is the position just past the last
   byte stored. *)
let rec input_retry ic buf pos len =
  match input ic buf pos len with
  | 0 -> pos
  | n when n = len -> pos + n
  | n -> input_retry ic buf (pos + n) (len - n)
(* Open [fn] for binary reading, apply [f] to the channel and close the
   channel afterwards, whether [f] returns or raises. *)
let with_file_in_bin fn f =
  let ic = open_in_bin fn in
  let release () = close_in ic in
  Fun.protect ~finally:release (fun () -> f ic)
(* Compare the contents of [fn1] and [fn2] chunk by chunk; true only when both
   files hold exactly the same bytes. *)
let is_really_same_file fn1 fn2 =
  with_file_in_bin fn1 (fun fh1 ->
      with_file_in_bin fn2 (fun fh2 ->
          let len = 2048 in
          let buf1 = Bytes.create len in
          let buf2 = Bytes.create len in
          let rec aux () =
            let read1 = input_retry fh1 buf1 0 len in
            let read2 = input_retry fh2 buf2 0 len in
            if read1 <> read2 then false
            else if read1 = 0 then true
            (* Compare only the bytes actually read: Bytes.create leaves the
               buffers uninitialized, so the tail of a short chunk is garbage
               that must not take part in the comparison. *)
            else if Bytes.sub buf1 0 read1 <> Bytes.sub buf2 0 read1 then false
            else aux ()
          in
          aux ()))
(* Entry point: walk the directory given as first argument and report hard
   links as well as files of at least the size given as second argument whose
   contents match a previously seen file. *)
let () =
  let by_digest = Hashtbl.create 128 in
  let by_identity = Hashtbl.create 128 in
  let min_size = int_of_string Sys.argv.(2) in
  let examine path stat =
    let identity = (stat.Unix.st_dev, stat.Unix.st_ino) in
    match Hashtbl.find_opt by_identity identity with
    | Some earlier ->
        print_endline ("File " ^ earlier ^ " is the same hard link as " ^ path)
    | None ->
        Hashtbl.add by_identity identity path;
        if stat.Unix.st_size >= min_size then begin
          let digest = Digest.file path in
          (* Equal digests are confirmed with a byte-wise comparison. *)
          let candidates = Hashtbl.find_all by_digest digest in
          match List.find_opt (is_really_same_file path) candidates with
          | Some earlier ->
              print_endline ("File " ^ earlier ^ " matches " ^ path)
          | None -> Hashtbl.add by_digest digest path
        end
  in
  directory_walk Sys.argv.(1) (fun path stat ->
      try examine path stat
      with Sys_error e -> prerr_endline ("Could not hash " ^ path ^ ": " ^ e))
</syntaxhighlight>
{{output}}
<pre>$ dune build @fmt
$ dune exec ./finddupes.exe . 1024
File ./finddupes.ml matches ./_build/default/finddupes.ml
File ./finddupes.ml matches ./_build/default/.formatted/finddupes.ml
</pre>
=={{header|Perl}}==
For the supplied directory, compare all files, recursing into sub-directories. By default, duplicate files of 1 byte or larger are shown; the minimum size is configurable with a command-line option. Uses CPAN <code>File</code> modules for enhanced portability.
<
use File::Compare qw(compare);
use Sort::Naturally;
Line 651 ⟶ 1,090:
}
print for find_dups(@ARGV);</
{{out}}
<pre> 372 aaa.txt, dir2/aaa.txt
Line 658 ⟶ 1,097:
=={{header|Phix}}==
Works on Windows and Linux. No handling of hard (or soft) links.
<!--<syntaxhighlight lang="phix">(notonline)-->
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- file i/o</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">min_size</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"backup"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">-- (example filter)</span>
<span style="color: #008080;">and</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
<span style="color: #004080;">atom</span> <span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">[</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">min_size</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">size</span><span style="color: #0000FF;">,</span><span style="color: #000000;">filepath</span><span style="color: #0000FF;">,</span><span style="color: #000000;">dir_entry</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\r"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #000080;font-style:italic;">-- keep going</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">exit_code</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">walk_dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"demo\\clocks\\love"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">store_res</span><span style="color: #0000FF;">,</span> <span style="color: #004600;">true</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #000000;">DESCENDING</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d files found\n"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">))</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #000000;">i</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">si</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]}),</span>
<span style="color: #000000;">sj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">join_path</span><span style="color: #0000FF;">({</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">][</span><span style="color: #000000;">3</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]})</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fni</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">si</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span>
<span style="color: #000000;">fnj</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">sj</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">),</span>
<span style="color: #000000;">size</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span>
<span style="color: #004080;">bool</span> <span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">true</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">fni</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">or</span> <span style="color: #000000;">fnj</span><span style="color: #0000FF;">=-</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #000000;">size</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span> <span style="color: #000080;font-style:italic;">-- (check eof as well)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)!=</span><span style="color: #7060A8;">getc</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">same</span> <span style="color: #0000FF;">=</span> <span style="color: #004600;">false</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fni</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fnj</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">same</span> <span style="color: #008080;">then</span>
<span style="color: #000080;font-style:italic;">-- prettifying the output left as an exercise...</span>
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]</span>
<span style="color: #000000;">duplicates</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()></span><span style="color: #000000;">t1</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"processing %d/%d...\r"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">i</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)})</span>
<span style="color: #000000;">t1</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">time</span><span style="color: #0000FF;">()+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"%d duplicates found\n"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">duplicates</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
Line 737 ⟶ 1,179:
=={{header|PicoLisp}}==
File duplicates in /bin dir on Void Linux. Hash provided by xxhash library via mmap.
<
(de mmap (L F)
(native "@" "mmap" 'N 0 L 1 2 F 0) )
Line 770 ⟶ 1,212:
group
(cadr Lst) )
(and (filter cdr L) (println (car Lst) @)) ) )</
{{out}}
<pre>
Line 792 ⟶ 1,234:
=={{header|Python}}==
<
import os
import hashlib
Line 843 ⟶ 1,285:
FindDuplicateFiles('/home/tim/Dropbox', 1024*1024)
</syntaxhighlight>
=={{header|Racket}}==
<
#lang racket
Line 886 ⟶ 1,328:
(show-duplicates (find-system-path 'home-dir) 1024)
</syntaxhighlight>
=={{header|Raku}}==
(formerly Perl 6)
This implementation takes a starting directory (defaults to the current directory) and has a few flags to set behaviour: --minsize, minimum file size to look at, defaults to 5 bytes; and --recurse, recurse into the directory structure, default True. It finds files of the same size, calculates hashes to compare, then reports files that hash the same.
<syntaxhighlight lang="raku" line>use Digest::SHA256::Native;
sub MAIN( $dir = '.', :$minsize = 5, :$recurse = True ) {
Line 910 ⟶ 1,352:
for %files.sort( +*.key ).grep( *.value.elems > 1)».kv -> ($size, @list) {
my %dups;
@list.map: { %dups{
for %dups.grep( *.value.elems > 1)».value -> @dups {
say sprintf("%9s : ", scale $size ), @dups.join(', ');
Line 924 ⟶ 1,366:
default { ($bytes / 2**30).round(.1) ~ ' GB' }
}
}</syntaxhighlight>
{{out|Sample output}}
Passing in command line switches: --minsize=0 --recurse=False /home/me/p6
Line 938 ⟶ 1,380:
<br>Note that the '''tFID''' (temp) file is hard coded to the '''C:''' drive.
<br>Only minimal error checking is performed.
<syntaxhighlight lang="rexx">
sep=center(' files are identical in size and content: ',79,"═") /*define the header. */
tFID= 'c:\TEMP\FINDDUP.TMP' /*use this as a temporary FileID. */
Line 963 ⟶ 1,405:
if lines(tFID)\==0 then 'ERASE' tFID /*do housecleaning (delete temp file).*/
/*stick a fork in it, we're all done. */</syntaxhighlight>
'''output''' when using (checking) with the default root directory:
<pre>
Line 987 ⟶ 1,429:
::* uses variables for some command names and command options
::* shows the number of files examined and also the directory name
<syntaxhighlight lang="rexx">
sep=center(' files are identical in size and content: ',79,"═") /*define the header. */
parse arg !; if !all(arg()) then exit /*boilerplate HELP(?)*/
Line 1,043 ⟶ 1,485:
novalue: syntax: call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl)
p: return word(arg(1),1)
s: if arg(1)==1 then return arg(3); return word(arg(2) 's',1)</syntaxhighlight>
'''output''' when using the DIR (folder): H:\#\REX
<pre>
Line 1,066 ⟶ 1,508:
=={{header|Ring}}==
<syntaxhighlight lang="ring">
# Project : Find duplicate files
Line 1,111 ⟶ 1,553:
next
return alist
</syntaxhighlight>
Output:
<pre>
Line 1,144 ⟶ 1,586:
=={{header|Ruby}}==
Files are first compared by size; only files of equal size are then compared by their MD5 digest.
<syntaxhighlight lang="ruby">
def find_duplicate_files(dir)
Line 1,161 ⟶ 1,603:
end
find_duplicate_files("/Windows/System32")</syntaxhighlight>
Sample Output:
Line 1,183 ⟶ 1,625:
=={{header|Rust}}==
<syntaxhighlight lang="rust">use std::{
collections::BTreeMap,
fs::{read_dir, File},
Line 1,279 ⟶ 1,721:
Ok(())
}</syntaxhighlight>
=={{header|Sidef}}==
It uses the portable ''File::Find'' module, which means that it should work on virtually any platform.
<syntaxhighlight lang="ruby">
require('File::Find')
Line 1,331 ⟶ 1,773:
say "#{files.sort.join(%Q[\n])}\n#{'-'*80}"
}
}</syntaxhighlight>
Section of sample output:
<pre>% sidef fdf.sf 0 /tmp /usr/bin
Line 1,358 ⟶ 1,800:
{{tcllib|fileutil}}
{{tcllib|md5}}
<syntaxhighlight lang="tcl">
package require md5
Line 1,414 ⟶ 1,856:
}
}
}</syntaxhighlight>
Section of sample output:
<pre>
Line 1,423 ⟶ 1,865:
./compat/zlib/contrib/delphi/zlibd32.mak has duplicates
dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak
</pre>
=={{header|Wren}}==
{{libheader|Wren-crypto}}
{{libheader|Wren-sort}}
<syntaxhighlight lang="wren">import "io" for Directory, File, Stat
import "./crypto" for Sha1
import "./sort" for Sort
// Reports groups of files with identical content found directly inside 'dir'
// (sub-directories are skipped, not entered).
//   dir     - path of the directory to scan; the fiber is aborted if it does not exist
//   minSize - entries smaller than this many bytes are ignored
// Files count as duplicates when the SHA-1 digests of their contents match.
// The groups are printed in order of decreasing file size.
var findDuplicates = Fn.new { |dir, minSize|
    if (!Directory.exists(dir)) Fiber.abort("Directory does not exist.")

    // Bucket every qualifying file name under the digest of its contents.
    var namesByDigest = {}
    for (name in Directory.list(dir)) {
        var fullPath = "%(dir)/%(name)"
        var bigEnough = Stat.path(fullPath).size >= minSize
        if (!bigEnough) continue
        if (Stat.path(fullPath).isDirectory) continue
        var digest = Sha1.digest(File.read(fullPath))
        if (!namesByDigest.containsKey(digest)) namesByDigest[digest] = []
        namesByDigest[digest].add(name)
    }

    // Keep only buckets holding more than one name, tagged with the file size
    // (taken from the first member, since identical files share one size).
    var duplicates = []
    for (group in namesByDigest.values) {
        if (group.count <= 1) continue
        var bytes = Stat.path("%(dir)/%(group[0])").size
        duplicates.add([bytes, group])
    }

    // Largest files first.
    var bySizeDescending = Fn.new { |a, b| (b[0] - a[0]).sign }
    Sort.insertion(duplicates, bySizeDescending)

    System.print("The sets of duplicate files are:\n")
    for (entry in duplicates) {
        var bytes = entry[0]
        var names = entry[1]
        System.print("Size %(bytes) bytes:")
        System.print(names.join("\n"))
        System.print()
    }
}
findDuplicates.call("./", 1000)</syntaxhighlight>
{{out}}
Sample output:
<pre>
The sets of duplicate files are:
Size 57221 bytes:
big.wren
big2.wren
Size 16696 bytes:
cls
clsc
Size 4096 bytes:
data.blk
data2.blk
Size 1415 bytes:
circular.wren
circular2.wren
</pre>
Line 1,431 ⟶ 1,939:
File findDupFiles.zkl:
<syntaxhighlight lang="zkl">
const FLAGS=FILE.GLOB.IGNORE_CASE + FILE.GLOB.NO_DIRS;
var [const] MsgHash=Import("zklMsgHash");
Line 1,484 ⟶ 1,992:
println("Found %d duplicate files:".fmt(files.apply("len").sum(0)));
foreach group in (files){ println(" ",group.concat(", ")) }</syntaxhighlight>
{{out}}
<pre>
|