Find duplicate files


In a large directory structure it is easy to inadvertently leave unnecessary copies of files around, which can use considerable disk space and create confusion.


Task

Create a program which, given a minimum size and a folder/directory, will find all files of at least the given size with duplicate contents under the directory, and output or show the sets of duplicate files in order of decreasing size.

The program may be command-line or graphical, and duplicate content may be determined by direct comparison or by calculating a hash of the data.

Specify which filesystems or operating systems your program works with if it has any filesystem- or OS-specific requirements.

Identify hard links (filenames referencing the same content) in the output if applicable for the filesystem.

For extra points, detect when whole directory sub-trees are identical, or optionally remove or link identical files.
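For orientation, here is a minimal Python sketch of the approach most of the entries below take: group candidate files by size, then confirm matches with a content hash, flag probable hard links (via st_nlink, where the filesystem reports it), and print the sets in order of decreasing size. It is only an illustration, not one of the submitted solutions, and the function and variable names are arbitrary.

import hashlib
import os
import sys
from collections import defaultdict

def find_duplicates(root, min_size=1):
    # Pass 1: group regular files of at least min_size bytes by size.
    by_size = defaultdict(list)
    for dirpath, _, names in os.walk(root):
        for name in names:
            path = os.path.join(dirpath, name)
            if os.path.islink(path) or not os.path.isfile(path):
                continue                                  # skip symlinks and specials
            size = os.path.getsize(path)
            if size >= min_size:
                by_size[size].append(path)

    # Pass 2: within each size class, confirm duplicates by content hash.
    for size in sorted(by_size, reverse=True):            # decreasing size
        by_hash = defaultdict(list)
        for path in by_size[size]:
            with open(path, "rb") as f:
                by_hash[hashlib.md5(f.read()).hexdigest()].append(path)
        for digest, dups in by_hash.items():
            if len(dups) > 1:
                print("%d bytes, md5 %s:" % (size, digest))
                for p in dups:
                    mark = " (hard link)" if os.stat(p).st_nlink > 1 else ""
                    print("    " + p + mark)

if __name__ == "__main__":
    find_duplicates(sys.argv[1] if len(sys.argv) > 1 else ".",
                    int(sys.argv[2]) if len(sys.argv) > 2 else 1)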

C++

The code uses

- xxhash_cpp downloaded from https://github.com/RedSpah/xxhash_cpp

- boost filesystem, boost format, and boost iostreams

Compiles on Windows 10 and Linux.

#include<iostream>
#include<string>
#include<boost/filesystem.hpp>
#include<boost/format.hpp>
#include<boost/iostreams/device/mapped_file.hpp>
#include<optional>
#include<algorithm>
#include<iterator>
#include<execution>
#include"dependencies/xxhash.hpp" // https://github.com/RedSpah/xxhash_cpp

/**
* Find ranges (neighbouring elements) of the same value within [begin, end[ and
* call callback for each such range
* @param begin start of container
* @param end end of container (1 beyond last element)
* @param getvalue function returning the comparison value for an element
* @param callback void(start, end, value), called once per range
* @return number of ranges found
*/
template<typename  T, typename V, typename F>
size_t for_each_adjacent_range(T begin, T end, V getvalue, F callback) {
    size_t partitions = 0;
    while (begin != end) {
        auto const& value = getvalue(*begin);
        auto current = begin;
        while (++current != end && getvalue(*current) == value);
        callback(begin, current, value);
        ++partitions;
        begin = current;
    }
    return partitions;
}

namespace bi = boost::iostreams;
namespace fs = boost::filesystem;

struct file_entry {
public:
    explicit file_entry(fs::directory_entry const & entry) 
        : path_{entry.path()}, size_{fs::file_size(entry)}
    {}
    auto size() const { return size_; }
    auto const& path() const { return path_; }
    auto get_hash() {
        if (!hash_)
            hash_ = compute_hash();
        return *hash_;
    }
private:
    xxh::hash64_t compute_hash() {
        bi::mapped_file_source source;
        source.open<fs::wpath>(this->path());
        if (!source.is_open()) {
            std::cerr << "Cannot open " << path() << std::endl;
            throw std::runtime_error("Cannot open file");
        }
        xxh::hash_state64_t hash_stream;
        hash_stream.update(source.data(), size_);
        return hash_stream.digest();
    }
private:
    fs::wpath path_;
    uintmax_t size_;
    std::optional<xxh::hash64_t> hash_;
};

using vector_type = std::vector<file_entry>;
using iterator_type = vector_type::iterator;

auto find_files_in_dir(fs::wpath const& path, vector_type& file_vector, uintmax_t min_size = 1) {
    size_t found = 0, ignored = 0;
    if (!fs::is_directory(path)) {
        std::cerr << path << " is not a directory!" << std::endl;
    }
    else {
        std::cerr << "Searching " << path << std::endl;

        for (auto& e : fs::recursive_directory_iterator(path)) {
            ++found;
            if (fs::is_regular_file(e) && fs::file_size(e) >= min_size)
                file_vector.emplace_back(e);
            else ++ignored;
        }
    }
    return std::make_tuple(found, ignored);
}

int main(int argn, char* argv[])
{
    vector_type files;
    for (auto i = 1; i < argn; ++i) {
        fs::wpath path(argv[i]);
        auto [found, ignored] = find_files_in_dir(path, files);
        std::cerr << boost::format{
            "  %1$6d files found\n"
            "  %2$6d files ignored\n"
            "  %3$6d files added\n" } % found % ignored % (found - ignored) 
            << std::endl;
    }

    std::cerr << "Found " << files.size() << " regular files" << std::endl;
    // sort files in descending order by file size
    std::sort(std::execution::par_unseq, files.begin(), files.end()
        , [](auto const& a, auto const& b) { return a.size() > b.size(); }
    );
    for_each_adjacent_range(
        std::begin(files)
        , std::end(files)
        , [](vector_type::value_type const& f) { return f.size(); }
        , [](auto start, auto end, auto file_size) {
            // Files with same size
            size_t nr_of_files = std::distance(start, end);
            if (nr_of_files > 1) {
                // sort range start-end by hash
                std::sort(start, end, [](auto& a, auto& b) { 
                    auto const& ha = a.get_hash();
                    auto const& hb = b.get_hash();
                    auto const& pa = a.path();
                    auto const& pb = b.path();
                    return std::tie(ha, pa) < std::tie(hb, pb); 
                    });
                for_each_adjacent_range(
                    start
                    , end
                    , [](vector_type::value_type& f) { return f.get_hash(); }
                    , [file_size](auto hstart, auto hend, auto hash) {
                        // Files with same size and same hash are assumed to be identical
                        // could resort to compare files byte-by-byte now
                        size_t hnr_of_files = std::distance(hstart, hend);
                        if (hnr_of_files > 1) {
                            std::cout << boost::format{ "%1$3d files with hash %3$016x and size %2$d\n" } 
                                % hnr_of_files % file_size % hash;
                            std::for_each(hstart, hend, [hash, file_size](auto& e) {
                                std::cout << '\t' << e.path() << '\n';
                                }
                            );
                        }
                    }
                );
            }
        }
    );
    
    return 0;
}
Output:
$ ./duplicates /usr/include/boost/
Searching "/usr/include/boost/"
   15264 files found
    1160 files ignored
   14104 files added

Found 14104 regular files
  2 files with hash 0c5e81a47dd8cd99 and size 15811
        "/usr/include/boost/mpl/vector/aux_/preprocessed/no_ctps/vector50_c.hpp"
        "/usr/include/boost/mpl/vector/aux_/preprocessed/plain/vector50_c.hpp"
  3 files with hash 0f2775c41bb647f3 and size 14766
        "/usr/include/boost/mpl/aux_/preprocessed/msvc60/vector.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/msvc70/vector.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/vector.hpp"
  3 files with hash f9e02ecc3e38f3a3 and size 14714
        "/usr/include/boost/mpl/aux_/preprocessed/msvc60/deque.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/msvc70/deque.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/deque.hpp"
  3 files with hash 73ed6d15fd62f8b3 and size 14620
        "/usr/include/boost/mpl/aux_/preprocessed/msvc60/list.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/msvc70/list.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/list.hpp"
  3 files with hash 7a43c97436ae1913 and size 14547
        "/usr/include/boost/mpl/aux_/preprocessed/msvc60/set.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/msvc70/set.hpp"
        "/usr/include/boost/mpl/aux_/preprocessed/no_ctps/set.hpp"
...

Elixir

Translation of: Ruby
defmodule Files do
  def find_duplicate_files(dir) do
    IO.puts "\nDirectory : #{dir}"
    File.cd!(dir, fn ->
      Enum.filter(File.ls!, fn fname -> File.regular?(fname) end)
      |> Enum.group_by(fn file -> File.stat!(file).size end)
      |> Enum.filter(fn {_, files} -> length(files)>1 end)
      |> Enum.each(fn {size, files} ->
           Enum.group_by(files, fn file -> :erlang.md5(File.read!(file)) end)
           |> Enum.filter(fn {_, files} -> length(files)>1 end)
           |> Enum.each(fn {_md5, fs} ->
                IO.puts "  --------------------------------------------"
                Enum.each(fs, fn file ->
                  IO.puts "  #{inspect File.stat!(file).mtime}\t#{size}  #{file}"
                end)
              end)
         end)
    end)
  end
end

hd(System.argv) |> Files.find_duplicate_files
Output:
C:\Elixir>elixir find_dup_file.exs \Windows\System32

Directory : \Windows\System32
  --------------------------------------------
  {{2009, 7, 14}, {1, 0, 32}}   31548  perfd009.dat
  {{2010, 11, 21}, {7, 14, 4}}  31548  perfd011.dat
  --------------------------------------------
  {{2015, 4, 29}, {18, 21, 50}} 5120  msdxm.ocx
  {{2015, 4, 29}, {18, 21, 50}} 5120  dxmasf.dll
  --------------------------------------------
  {{2010, 11, 21}, {3, 23, 55}} 91648  mapi32.dll
  {{2010, 11, 21}, {3, 23, 55}} 91648  mapistub.dll
  --------------------------------------------
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcp110_clr0400.dll
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcr100_clr0400.dll
  {{2014, 4, 11}, {13, 39, 56}} 18088  msvcr110_clr0400.dll

Go

In theory this should work on any of the operating systems supported by Go (Linux, macOS, Windows, OpenBSD, etc.), though it has only been tested on Ubuntu 16.04.

package main

import (
    "fmt"
    "crypto/md5"
    "io/ioutil"
    "log"
    "os"
    "path/filepath"
    "sort"
    "time"
)

type fileData struct {
    filePath string
    info     os.FileInfo
}

type hash [16]byte

func check(err error) {
    if err != nil {
        log.Fatal(err)
    }
}

func checksum(filePath string) hash {
    bytes, err := ioutil.ReadFile(filePath)
    check(err)
    return hash(md5.Sum(bytes))
}

func findDuplicates(dirPath string, minSize int64) [][2]fileData {
    var dups [][2]fileData
    m := make(map[hash]fileData)
    werr := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
        if err != nil {
            return err
        }
        if !info.IsDir() && info.Size() >= minSize {
            h := checksum(path)
            fd, ok := m[h]
            fd2 := fileData{path, info}
            if !ok {
                m[h] = fd2
            } else {
                dups = append(dups, [2]fileData{fd, fd2})
            }
        }
        return nil
    })
    check(werr)
    return dups
}

func main() {
    dups := findDuplicates(".", 1)
    fmt.Println("The following pairs of files have the same size and the same hash:\n")
    fmt.Println("File name                 Size      Date last modified")
    fmt.Println("==========================================================")
    sort.Slice(dups, func(i, j int) bool {
        return dups[i][0].info.Size() > dups[j][0].info.Size() // in order of decreasing size
    })
    for _, dup := range dups {
        for i := 0; i < 2; i++ {
            d := dup[i]
            fmt.Printf("%-20s  %8d    %v\n", d.filePath, d.info.Size(), d.info.ModTime().Format(time.ANSIC))
        }
        fmt.Println()
    }
}
Output:

Sample output:

The following pairs of files have the same size and the same hash:

File name                 Size      Date last modified
==========================================================
vib.gif                 689113    Wed Sep 26 16:33:34 2018
vibrating.gif           689113    Tue Oct  2 00:38:08 2018

analysis2.txt             6155    Thu Sep 13 12:19:06 2018
temp/analysis3.txt        6155    Fri Dec 28 15:20:54 2018

w_pinstripe.png           2994    Tue Sep 25 12:18:05 2018
wb_pinstripe.png          2994    Tue Sep 25 12:06:53 2018

sox.txt                     63    Sat Dec 22 21:59:23 2018
sox2.txt                    63    Fri Dec 28 12:19:02 2018

Haskell

- checks for wrong command line input (non-existent directory / negative size)
- works on Windows as well as Unix systems (tested with Mint 17 / Windows 7)
import Crypto.Hash.MD5        (hash)
import Data.ByteString as BS  (readFile, ByteString())
import System.Environment     (getArgs, getProgName)
import System.Directory       (doesDirectoryExist, getDirectoryContents)
import System.FilePath.Posix  ((</>))
import Control.Monad          (forM)
import Text.Printf            (printf)
import System.IO              (withFile, IOMode(ReadMode), hFileSize)


type File = (BS.ByteString, -- md5hash
             FilePath)      -- filepath

type FileSize = Integer

getRecursiveContents :: FilePath -> FileSize -> IO [File]
getRecursiveContents curDir maxsize = do
  names <- getDirectoryContents curDir
  let dirs = filter (`notElem` [".", ".."]) names
  files <- forM dirs $ \path -> do
             let path' = curDir </> path
             exists <- doesDirectoryExist path'
             if exists
                then getRecursiveContents path' maxsize
                else genFileHash path' maxsize
  return $ concat files


genFileHash :: FilePath -> FileSize -> IO [File]
genFileHash path maxsize = do
  size <- withFile path ReadMode hFileSize
  if size <= maxsize
    then BS.readFile path >>= \bs -> return [(hash bs, path)]
    else return []

findDuplicates :: FilePath -> FileSize -> IO ()
findDuplicates dir bytes = do
  exists <- doesDirectoryExist dir
  if exists
    then getRecursiveContents dir bytes >>= findSameHashes
    else printf "Sorry, the directory \"%s\" does not exist...\n" dir

findSameHashes :: [File] -> IO ()
findSameHashes []     = return ()
findSameHashes ((hash, fp):xs) = do
  case lookup hash xs of
    (Just dupFile) -> printf "===========================\n\
                            \Found duplicate:\n\
                            \=> %s \n\
                            \=> %s \n\n" fp dupFile
                      >> findSameHashes xs
    (_)            -> findSameHashes xs

main :: IO ()
main = do
  args <- getArgs
  case args of
    [dir, mbytes] | [(bytes ,"")] <- reads mbytes
                   , bytes >= 1 -> findDuplicates dir bytes
    (_) -> do
      name <- getProgName
      printf "Something went wrong - please use ./%s <dir> <bytes>\n" name

Example output:

$./finddups ~/Documents/MyGit/Haskell/ 20000
===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/.git/logs/HEAD 
=> /home/rewrite/Documents/MyGit/Haskell/.git/logs/refs/heads/master 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/.git/refs/remotes/origin/master 
=> /home/rewrite/Documents/MyGit/Haskell/.git/refs/heads/master 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/RosettaCode/Find-duplicate-files/sampletext.txt 
=> /home/rewrite/Documents/MyGit/Haskell/RosettaCode/otherdup.txt 

===========================
Found duplicate:
=> /home/rewrite/Documents/MyGit/Haskell/RWH/FileManipulation/toupper-imp.hs 
=> /home/rewrite/Documents/MyGit/Haskell/RWH/FileManipulation/toupper-imp.hs~ 


$./finddups /home/rewrite/NotExistingDir 200000
Sorry, the directory "/home/rewrite/NotExistingDir" does not exist...


$./finddups /home/rewrite/ -100
Something went wrong - please use ./finddups <dir> <bytes>

Java

This should work with any OS or filesystem supported by Java. Hard links are indicated by displaying the files on the same line separated by "=". MD5 checksums are used to detect duplicate files.

import java.io.*;
import java.nio.*;
import java.nio.file.*;
import java.nio.file.attribute.*;
import java.security.*;
import java.util.*;

public class DuplicateFiles {
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Directory name and minimum file size are required.");
            System.exit(1);
        }
        try {
            findDuplicateFiles(args[0], Long.parseLong(args[1]));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void findDuplicateFiles(String directory, long minimumSize)
        throws IOException, NoSuchAlgorithmException {
        System.out.println("Directory: '" + directory + "', minimum size: " + minimumSize + " bytes.");
        Path path = FileSystems.getDefault().getPath(directory);
        FileVisitor visitor = new FileVisitor(path, minimumSize);
        Files.walkFileTree(path, visitor);
        System.out.println("The following sets of files have the same size and checksum:");
        for (Map.Entry<FileKey, Map<Object, List<String>>> e : visitor.fileMap_.entrySet()) {
            Map<Object, List<String>> map = e.getValue();
            if (!containsDuplicates(map))
                continue;
            List<List<String>> fileSets = new ArrayList<>(map.values());
            for (List<String> files : fileSets)
                Collections.sort(files);
            Collections.sort(fileSets, new StringListComparator());
            FileKey key = e.getKey();
            System.out.println();
            System.out.println("Size: " + key.size_ + " bytes");
            for (List<String> files : fileSets) {
                for (int i = 0, n = files.size(); i < n; ++i) {
                    if (i > 0)
                        System.out.print(" = ");
                    System.out.print(files.get(i));
                }
                System.out.println();
            }
        }
    }

    private static class StringListComparator implements Comparator<List<String>> {
        public int compare(List<String> a, List<String> b) {
            int len1 = a.size(), len2 = b.size();
            for (int i = 0; i < len1 && i < len2; ++i) {
                int c = a.get(i).compareTo(b.get(i));
                if (c != 0)
                    return c;
            }
            return Integer.compare(len1, len2);
        }
    }

    private static boolean containsDuplicates(Map<Object, List<String>> map) {
        if (map.size() > 1)
            return true;
        for (List<String> files : map.values()) {
            if (files.size() > 1)
                return true;
        }
        return false;
    }

    private static class FileVisitor extends SimpleFileVisitor<Path> {
        private MessageDigest digest_;
        private Path directory_;
        private long minimumSize_;
        private Map<FileKey, Map<Object, List<String>>> fileMap_ = new TreeMap<>();

        private FileVisitor(Path directory, long minimumSize) throws NoSuchAlgorithmException {
            directory_ = directory;
            minimumSize_ = minimumSize;
            digest_ = MessageDigest.getInstance("MD5");
        }

        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            if (attrs.size() >= minimumSize_) {
                FileKey key = new FileKey(file, attrs, getMD5Sum(file));
                Map<Object, List<String>> map = fileMap_.get(key);
                if (map == null)
                    fileMap_.put(key, map = new HashMap<>());
                List<String> files = map.get(attrs.fileKey());
                if (files == null)
                    map.put(attrs.fileKey(), files = new ArrayList<>());
                Path relative = directory_.relativize(file);
                files.add(relative.toString());
            }
            return FileVisitResult.CONTINUE;
        }

        private byte[] getMD5Sum(Path file) throws IOException {
            digest_.reset();
            try (InputStream in = new FileInputStream(file.toString())) {
                byte[] buffer = new byte[8192];
                int bytes;
                while ((bytes = in.read(buffer)) != -1) {
                    digest_.update(buffer, 0, bytes);
                }
            }
            return digest_.digest();
        }
    }

    private static class FileKey implements Comparable<FileKey> {
        private byte[] hash_;
        private long size_;

        private FileKey(Path file, BasicFileAttributes attrs, byte[] hash) throws IOException {
            size_ = attrs.size();
            hash_ = hash;
        }

        public int compareTo(FileKey other) {
            int c = Long.compare(other.size_, size_);
            if (c == 0)
                c = hashCompare(hash_, other.hash_);
            return c;
        }
    }

    private static int hashCompare(byte[] a, byte[] b) {
        int len1 = a.length, len2 = b.length;
        for (int i = 0; i < len1 && i < len2; ++i) {
            int c = Byte.compare(a[i], b[i]);
            if (c != 0)
                return c;
        }
        return Integer.compare(len1, len2);
    }
}
Output:
Directory: 'test', minimum size: 1000 bytes.
The following sets of files have the same size and checksum:

Size: 16370 bytes
file2
file4
file5

Size: 8188 bytes
file1 = file3
file6

Julia

Works with: Julia version 0.6 and higher

This solution uses the Nettle package for MD5 hashing. It should work on Windows, macOS and Linux.

using Printf, Nettle

function find_duplicates(path::String, minsize::Int = 0)
    filesdict = Dict{String,Array{NamedTuple}}()

    for (root, dirs, files) in walkdir(path), fn in files
        filepath = joinpath(root, fn)
        filestats = stat(filepath)

        filestats.size > minsize || continue

        hash = open(f -> hexdigest("md5", read(f)), filepath)

        if haskey(filesdict, hash)
            push!(filesdict[hash], (path = filepath, stats = filestats))
        else
            filesdict[hash] = [(path = filepath, stats = filestats)]
        end
    end

    # Get duplicates
    dups = [tups for tups in values(filesdict) if length(tups) > 1]

    return dups

end

function main()
    path = "."
    println("Finding duplicates in \"$path\"")
    dups = find_duplicates(path, 1)

    println("The following group of files have the same size and the same hash:\n")
    println("File name                                       Size   last modified")
    println("="^76)

    for files in sort(dups, by = tups -> tups[1].stats.size, rev = true)
        for (path, stats) in sort(files, by = tup -> tup.path, rev = true)
            @printf("%-44s%8d   %s\n", path, stats.size, Libc.strftime(stats.mtime))
        end
        println()
    end
end

main()
Output:

Sample output:

Finding duplicates in "."
The following group of files have the same size and the same hash:

File name                                       Size   last modified
============================================================================
.\TestExamples\audio_file.mp3                8945229   11/29/2019 7:39:50 PM
.\TestExamples\audio_file-copy.mp3           8945229   11/30/2019 1:03:09 PM

.\TestExamples\doc_file.doc                   503296   11/29/2019 7:39:49 PM
.\TestExamples\doc_file-txt_copy.txt          503296   11/29/2019 7:40:05 PM
.\TestExamples\doc_file-copy.doc              503296   11/29/2019 7:39:49 PM

.\TestExamples\text_file.txt                      27   11/29/2019 7:37:12 PM
.\TestExamples\text_file-copy.txt                 27   11/29/2019 7:37:12 PM

Mathematica/Wolfram Language

hash="SHA256";
minSize=Quantity[1,"Megabytes"];
allfiles=Once@Select[FileNames["*","",Infinity],!Once@DirectoryQ[#]&&Once@FileSize[#]>minSize&];
data={#,Once[FileHash[#,hash,All,"HexString"]]}&/@allfiles[[;;5]];
Grid[Select[GatherBy[data,Last],Length[#]>1&][[All,All,1]]]
Sample output:

sample directory:

someFile   eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe
someFile2   3e6be6db0858c18573af3fde8308fa9759209079e2e372e21ebd6d3c8512d09e
someFile3   bef0039c33277f743b60b0076871110b96e14de34045aafc8e764349de6043b5
directory\someFile  eebe4df6d2951e77973b83af039f6565b215f74113028bbc5d8f96b856947abe
directory\someFile4 e6385b50ec8b052b141588573f680261db714babe534d8ced8a17985b14f58e9

sample output:

35 MB  {someFile,directory\someFile}

Nim

Our solution works on Linux and likely on any POSIX system. To mark hard links, we provide the inode number of each file: two paths with the same inode number are in fact two links to the same file. To make them more visible, an asterisk is used.

The detection of hard links may be OS dependent and may not work on Windows.

import algorithm
import os
import strformat
import strutils
import tables
import std/sha1
import times

type

  # Mapping "size" -> "list of paths".
  PathsFromSizes = Table[BiggestInt, seq[string]]

  # Mapping "hash" -> "list fo paths".
  PathsFromHashes = Table[string, seq[string]]

  # Information data.
  Info = tuple[size: BiggestInt; paths: seq[string]]


#---------------------------------------------------------------------------------------------------

proc processCmdLine(): tuple[dirpath: string; minsize: Natural] =
  ## Process the command line. Extra parameters are ignored.

  if paramCount() == 0:
    quit fmt"Usage: {getAppFileName().splitPath()[1]} folder minsize"

  result.dirpath = paramStr(1)
  if not result.dirpath.dirExists():
    quit fmt"Wrong directory path: {result.dirpath}"

  if paramCount() >= 2:
    try:
      result.minsize = parseInt(paramStr(2))
    except ValueError:
      quit fmt"Wrong minimum size: {paramStr(2)}"

#---------------------------------------------------------------------------------------------------

proc initPathsFromSize(dirpath: string; minsize: Natural): PathsFromSizes =
  ## Retrieve the files in directory "dirpath" with minimal size "minsize"
  ## and build the mapping from size to paths.

  for path in dirpath.walkDirRec():
    if not path.fileExists():
      continue    # Not a regular file.
    let size = path.getFileSize()
    if size >= minSize:
      # Store path in "size to paths" table.
      result.mgetOrPut(size, @[]).add(path)

#---------------------------------------------------------------------------------------------------

proc initPathsFromHashes(pathsFromSizes: PathsFromSizes): PathsFromHashes =
  ## Compute hashes for files whose size is not unique and build the mapping
  ## from hash to paths.

  for size, paths in pathsFromSizes.pairs:
    if paths.len > 1:
      for path in paths:
        # Store path in "digest to paths" table.
        result.mgetOrPut($path.secureHashFile(), @[]).add(path)

#---------------------------------------------------------------------------------------------------

proc cmp(x, y: Info): int =
  ## Compare two information tuples. Used to sort the list of duplicates files.

  result = cmp(x.size, y.size)
  if result == 0:
    # Same size. Compare the first paths (we are sure that they are different).
    result = cmp(x.paths[0], y.paths[0])

#---------------------------------------------------------------------------------------------------

proc displayDuplicates(dirpath: string; pathsFromHashes: PathsFromHashes) =
  ## Display duplicates files in directory "dirpath".

  echo "Files with same size and same SHA1 hash value in directory: ", dirpath
  echo ""

  # Build list of duplicates.
  var duplicates: seq[Info]
  for paths in pathsFromHashes.values:
    if paths.len > 1:
      duplicates.add((paths[0].getFileSize(), sorted(paths)))
  if duplicates.len == 0:
    echo "No files"
    return
  duplicates.sort(cmp, Descending)

  # Display duplicates.
  echo fmt"""{"Size":>10}     {"Last date modified":^19}   {"Inode":>8}    HL    File name"""
  echo repeat('=', 80)
  for (size, paths) in duplicates:
    echo ""
    for path in paths:
      let mtime = path.getLastModificationTime().format("YYYY-MM-dd HH:mm:ss")
      let info = path.getFileInfo()
      let inode = info.id.file
      let hardlink = if info.linkCount == 1: " " else: "*"
      echo fmt"{size:>10} {mtime:>23} {inode:>12}  {hardlink:<5} {path.relativePath(dirpath)}"


#———————————————————————————————————————————————————————————————————————————————————————————————————

let (dirpath, minsize) = processCmdLine()
let pathsFromSizes = initPathsFromSize(dirpath, minsize)
let pathsFromHashes = initPathsFromHashes(pathsFromSizes)
dirpath.displayDuplicates(pathsFromHashes)
Output:
Files with same size and same SHA1 hash value in directory: .

      Size     Last date modified       Inode    HL    File name
================================================================================

    499515     2020-12-10 22:48:06     12981503        subdir/tree.ppm
    499515     2020-12-10 22:45:26     12722201  *     subdir/tree1.ppm
    499515     2020-12-10 22:45:26     12722201  *     tree.ppm
    499515     2020-12-10 22:47:51     12722205        tree1.ppm

     65322     2020-12-10 22:44:53     12722178  *     house.jpg
     65322     2020-12-10 22:44:53     12722178  *     house1.jpeg

      6401     2020-12-10 22:45:07     12722182        dragon.png
      6401     2020-12-10 22:45:53     12722204        dragon1.png
      6401     2020-12-10 22:46:21     12981502        subdir/dragon.png

Objeck

Solution works on Windows, macOS and Linux.

use System.IO.File;
use System.Time;
use Collection;

class Duplicate {
  function : Main(args : String[]) ~ Nil {
    if(args->Size() = 2) {
      file_sets := SortDups(GetDups(args[0], args[1]->ToInt()));
      each(i : file_sets) {
        file_set := file_sets->Get(i)->As(Vector);
        if(file_set->Size() > 1) {
          "Duplicates:"->PrintLine();
          "----"->PrintLine();
          each(j : file_set) {
            file_set->Get(j)->As(FileMeta)->ToString()->PrintLine();
          };
        };
        '\n'->Print();
      };
    };
  }

  function : SortDups(unsorted : Vector) ~ Vector {
    sorted := IntMap->New();

    each(i : unsorted) {
      value := unsorted->Get(i)->As(Vector);
      key := value->Get(0)->As(FileMeta)->GetSize();
      sorted->Insert(key, value);
    };

    return sorted->GetValues();
  }

  function : GetDups(dir : String, size : Int) ~ Vector {
    duplicates := StringMap->New();

    files := Directory->List(dir);
    each(i : files) {
      file_name := String->New(dir);
      file_name += '/';
      file_name += files[i];

      file_size := File->Size(file_name);
      if(file_size >= size) {
        file_date := File->ModifiedTime(file_name);
        file_hash := file_size->ToString();
        file_hash += ':';
        file_hash += Encryption.Hash->MD5(FileReader->ReadBinaryFile(file_name))->ToString();
        file_meta := FileMeta->New(file_name, file_size, file_date, file_hash);

        file_set := duplicates->Find(file_hash)->As(Vector);
        if(file_set = Nil) {
          file_set := Vector->New();
          duplicates->Insert(file_hash, file_set);
        };
        file_set->AddBack(file_meta);
      };
    };

    return duplicates->GetValues();
  }
}

class FileMeta {
  @name : String;
  @size : Int;
  @date : Date;
  @hash : String;

  New(name : String, size : Int, date : Date, hash : String) {
    @name := name;
    @size := size;
    @date := date;
    @hash := hash;
  }

  method : public : GetSize() ~ Int {
    return @size;
  }

  method : public : ToString() ~ String {
    date_str := @date->ToShortString();
    return "{$@name}, {$@size}, {$date_str}";
  }
}
Output:
$ obr duplicate.obe /tmp/foo 4000
Duplicates:
----
/tmp/foo/bb.obe, 19822, 3/29/2019 8:07:21 PM
/tmp/foo/aa.obe, 19822, 3/29/2019 8:07:17 PM

Duplicates:
----
/tmp/foo/hh.obe, 20020, 3/29/2019 8:47:43 PM
/tmp/foo/gg.obe, 20020, 3/29/2019 8:47:37 PM
/tmp/foo/ee.obe, 20020, 3/29/2019 8:47:33 PM
/tmp/foo/dd.obe, 20020, 3/29/2019 8:47:14 PM

OCaml

Although this solution uses the Unix module, it only calls lstat from there, which exists in the Windows port of the Unix module and so should be portable.

let readdir_or_empty dir =
  try Sys.readdir dir
  with Sys_error e ->
    prerr_endline ("Could not read dir " ^ dir ^ ": " ^ e);
    [||]

let directory_walk root func =
  let rec aux dir =
    readdir_or_empty dir
    |> Array.iter (fun filename ->
           let path = Filename.concat dir filename in
           let open Unix in
           let stat = lstat path in
           match stat.st_kind with
           | S_DIR -> aux path
           | S_REG -> func path stat
           | _ -> ())
  in
  aux root

let rec input_retry ic buf pos len =
  let count = input ic buf pos len in
  if count = 0 || count = len then count + pos
  else input_retry ic buf (pos + count) (len - count)

let with_file_in_bin fn f =
  let fh = open_in_bin fn in
  Fun.protect ~finally:(fun () -> close_in fh) (fun () -> f fh)

let is_really_same_file fn1 fn2 =
  with_file_in_bin fn1 (fun fh1 ->
      with_file_in_bin fn2 (fun fh2 ->
          let len = 2048 in
          let buf1 = Bytes.create len in
          let buf2 = Bytes.create len in
          let rec aux () =
            let read1 = input_retry fh1 buf1 0 len in
            let read2 = input_retry fh2 buf2 0 len in
            if read1 <> read2 || buf1 <> buf2 then false
            else if read1 = 0 then true
            else aux ()
          in
          aux ()))

let () =
  let tbl = Hashtbl.create 128 in
  let seen = Hashtbl.create 128 in
  let min_size = int_of_string Sys.argv.(2) in
  directory_walk Sys.argv.(1) (fun path stat ->
      try
        let identity_tuple = (stat.st_dev, stat.st_ino) in
        match Hashtbl.find_opt seen identity_tuple with
        | Some existing ->
            print_endline
              ("File " ^ existing ^ " is the same hard link as " ^ path)
        | None -> (
            Hashtbl.add seen identity_tuple path;
            let size = stat.st_size in
            if size >= min_size then
              let digest = Digest.file path in
              Hashtbl.find_all tbl digest
              |> List.find_opt (is_really_same_file path)
              |> function
              | Some existing ->
                  print_endline ("File " ^ existing ^ " matches " ^ path)
              | None -> Hashtbl.add tbl digest path)
      with Sys_error e -> prerr_endline ("Could not hash " ^ path ^ ": " ^ e))
Output:
$ dune build @fmt
$ dune exec ./finddupes.exe . 1024
File ./finddupes.ml matches ./_build/default/finddupes.ml
File ./finddupes.ml matches ./_build/default/.formatted/finddupes.ml

Perl

For the supplied directory, compare all files, recursing into sub-directories. By default, duplicate files of 1 byte or larger are shown; this is configurable with a command-line option. CPAN File modules are used for enhanced portability.

use File::Find qw(find);
use File::Compare qw(compare);
use Sort::Naturally;
use Getopt::Std qw(getopts);

my %opts;
$opts{s} = 1;
getopts("s:", \%opts);

sub find_dups {
    my($dir) = @_;

    my @results;
    my %files;
    find {
        no_chdir => 1,
        wanted => sub { lstat; -f _ && (-s >= $opts{s} ) && push @{$files{-s _}}, $_ }
    } => $dir;

    foreach my $files (values %files) {
        next unless @$files;

        my %dups;
        foreach my $a (0 .. @$files - 1) {
            for (my $b = $a + 1 ; $b < @$files ; $b++) {
                next if compare(@$files[$a], @$files[$b]);
                push @{$dups{ @$files[$a] }}, splice @$files, $b--, 1;
            }
        }

        while (my ($original, $clones) = each %dups) {
            push @results, sprintf "%8d %s\n", (stat($original))[7], join ', ', sort $original, @$clones;
        }
    }
    reverse nsort @results;

}

print for find_dups(@ARGV);
Output:
     372 aaa.txt, dir2/aaa.txt
      29 bbb.txt, dir1/bbb.txt

Phix

Works on Windows and Linux. No handling of hard (or soft) links.

without js -- file i/o
integer min_size=1
sequence res = {}
atom t1 = time()+1
 
function store_res(string filepath, sequence dir_entry)
    if not match("backup",filepath) -- (example filter)
    and not find('d', dir_entry[D_ATTRIBUTES]) then
        atom size = dir_entry[D_SIZE]
        if size>=min_size then
            res = append(res,{size,filepath,dir_entry})
            if time()>t1 then
                printf(1,"%d files found\r",length(res))
                t1 = time()+1
            end if
        end if
    end if
    return 0 -- keep going
end function
integer exit_code = walk_dir("demo\\clocks\\love", store_res, true)
 
res = sort(res,DESCENDING)
printf(1,"%d files found\n",length(res))
 
integer duplicates = 0
for i=1 to length(res)-1 do
    for j=i+1 to length(res) do
        if res[i][1]!=res[j][1] then exit end if
        string si = join_path({res[i][2],res[i][3][D_NAME]}),
               sj = join_path({res[j][2],res[j][3][D_NAME]})
        integer fni = open(si,"rb"),
                fnj = open(sj,"rb"),
                size = res[i][1]
        bool same = true
        if fni=-1 or fnj=-1 then ?9/0 end if
        for k=1 to size+1 do    -- (check eof as well)
            if getc(fni)!=getc(fnj) then
                same = false
                exit
            end if
        end for
        close(fni)
        close(fnj)
        if same then
            -- prettifying the output left as an exercise...
            ?res[i]
            ?res[j]
            duplicates += 1
        end if
    end for
    if time()>t1 then
        printf(1,"processing %d/%d...\r",{i,length(res)})
        t1 = time()+1
    end if
end for
printf(1,"%d duplicates found\n",duplicates)
Output:
136 files found
{2996224,"demo\\clocks\\love\\love-0.9.1-win32",{"love.dll","",2996224,2014,4,1,19,54,33}}
{2996224,"demo\\clocks\\love\\Chemical Me",{"love.dll","a",2996224,2014,4,1,19,54,32}}
{1059840,"demo\\clocks\\love\\love-0.9.1-win32",{"DevIL.dll","",1059840,2014,4,1,19,53,31}}
{1059840,"demo\\clocks\\love\\Chemical Me",{"DevIL.dll","a",1059840,2014,4,1,19,53,30}}
{875472,"demo\\clocks\\love\\love-0.9.1-win32",{"msvcr110.dll","",875472,2012,11,6,0,20,52}}
{875472,"demo\\clocks\\love\\Chemical Me",{"msvcr110.dll","a",875472,2012,11,6,0,20,52}}
{774656,"demo\\clocks\\love\\love-0.9.1-win32",{"SDL2.dll","",774656,2014,4,1,19,53,36}}
{774656,"demo\\clocks\\love\\Chemical Me",{"SDL2.dll","a",774656,2014,4,1,19,53,36}}
{535008,"demo\\clocks\\love\\love-0.9.1-win32",{"msvcp110.dll","",535008,2012,11,6,0,20,52}}
{535008,"demo\\clocks\\love\\Chemical Me",{"msvcp110.dll","a",535008,2012,11,6,0,20,52}}
{349184,"demo\\clocks\\love\\love-0.9.1-win32",{"OpenAL32.dll","",349184,2014,4,1,19,53,33}}
{349184,"demo\\clocks\\love\\Chemical Me",{"OpenAL32.dll","a",349184,2014,4,1,19,53,32}}
{347648,"demo\\clocks\\love\\love-0.9.1-win32",{"lua51.dll","",347648,2014,4,1,19,53,49}}
{347648,"demo\\clocks\\love\\Chemical Me",{"lua51.dll","a",347648,2014,4,1,19,53,48}}
{139264,"demo\\clocks\\love\\love-0.9.1-win32",{"mpg123.dll","",139264,2014,4,1,19,53,52}}
{139264,"demo\\clocks\\love\\Chemical Me",{"mpg123.dll","a",139264,2014,4,1,19,53,52}}
8 duplicates found

PicoLisp

Finds duplicate files in the /bin directory on Void Linux. The hash is provided by the xxhash library via mmap.

`(== 64 64)
(de mmap (L F)
   (native "@" "mmap" 'N 0 L 1 2 F 0) )
(de munmap (A L)
   (native "@" "munmap" 'N A L) )
(de xxh64 (M S)
   (let
      (R (native "libxxhash.so" "XXH64" 'N M S 0)
         P `(** 2 64) )
      (if (lt0 R)
         (& (+ R P) (dec P))
         R ) ) )
(de walk (Dir)
   (recur (Dir)
      (for F (dir Dir)
         (let (Path (pack Dir "/" F)  Info (info Path T))
            (when (car Info)
               (if (=T (car Info))
                  (recurse Path)
                  (if (lup D (car Info))
                     (push (cdr @) Path)
                     (idx 'D (list (car Info) (cons Path)) T) ) ) ) ) ) ) )
(off D)
(walk "/bin")
(for Lst (filter cdadr (idx 'D))
   (let L
      (by
         '((F)
            (let (M (mmap (car Lst) (open F T))
               S (car Lst) )
               (prog1 (xxh64 M S) (munmap M S)) ) )
         group
         (cadr Lst) )
      (and (filter cdr L) (println (car Lst) @)) ) )
Output:
1045 (("/bin/envvars-std" "/bin/envvars"))
1246 (("/bin/pdftexi2dvi" "/bin/texi2pdf"))
2346 (("/bin/gunzip" "/bin/uncompress"))
5719 (("/bin/roff2dvi" "/bin/roff2html" "/bin/roff2pdf" "/bin/roff2ps" "/bin/roff2text" "/bin/roff2x"))
35384 (("/bin/gcc-ar" "/bin/x86_64-unknown-linux-gnu-gcc-ar") ("/bin/gcc-nm" "/bin/x86_64-unknown-linux-gnu-gcc-nm"))
35392 (("/bin/gcc-ranlib" "/bin/x86_64-unknown-linux-gnu-gcc-ranlib"))
36478 (("/bin/aclocal-1.16" "/bin/aclocal"))
45800 (("/bin/perlthanks" "/bin/perlbug"))
178384 (("/bin/unzip" "/bin/zipinfo"))
257949 (("/bin/automake" "/bin/automake-1.16"))
512640 (("/bin/makewhatis" "/bin/mandoc"))
645464 (("/bin/gawk-5.0.1" "/bin/gawk"))
865760 (("/bin/zsh" "/bin/zsh-5.8"))
1129008 (("/bin/gcc" "/bin/x86_64-unknown-linux-gnu-gcc" "/bin/x86_64-unknown-linux-gnu-gcc-9.3.0"))
1133104 (("/bin/c++" "/bin/g++" "/bin/x86_64-unknown-linux-gnu-c++" "/bin/x86_64-unknown-linux-gnu-g++"))

Python

from __future__ import print_function
import os
import hashlib
import datetime

def FindDuplicateFiles(pth, minSize = 0, hashName = "md5"):
    knownFiles = {}

    #Analyse files
    for root, dirs, files in os.walk(pth):
        for fina in files:
            fullFina = os.path.join(root, fina)
            isSymLink = os.path.islink(fullFina)
            if isSymLink:
                continue # Skip symlinks
            si = os.path.getsize(fullFina)
            if si < minSize:
                continue
            if si not in knownFiles:
                knownFiles[si] = {}
            h = hashlib.new(hashName)
            with open(fullFina, "rb") as fileHandle:
                h.update(fileHandle.read())
            hashed = h.digest()
            if hashed in knownFiles[si]:
                fileRec = knownFiles[si][hashed]
                fileRec.append(fullFina)
            else:
                knownFiles[si][hashed] = [fullFina]

    #Print result
    sizeList = list(knownFiles.keys())
    sizeList.sort(reverse=True)
    for si in sizeList:
        filesAtThisSize = knownFiles[si]
        for hashVal in filesAtThisSize:
            if len(filesAtThisSize[hashVal]) < 2:
                continue
            fullFinaLi = filesAtThisSize[hashVal]
            print ("=======Duplicate=======")
            for fullFina in fullFinaLi:
                st = os.stat(fullFina)
                isHardLink = st.st_nlink > 1 
                infoStr = []
                if isHardLink:
                    infoStr.append("(Hard linked)")
                fmtModTime = datetime.datetime.utcfromtimestamp(st.st_mtime).strftime('%Y-%m-%dT%H:%M:%SZ')
                print (fmtModTime, si, os.path.relpath(fullFina, pth), " ".join(infoStr))

if __name__=="__main__":

    FindDuplicateFiles('/home/tim/Dropbox', 1024*1024)

Racket

#lang racket

(struct F (name id size [links #:mutable]))

(require openssl/sha1)
(define (find-duplicate-files path size)
  (define Fs
    (sort
     (fold-files
      (λ(path type acc)
        (define s (and (eq? 'file type) (file-size path)))
        (define i (and s (<= size s) (file-or-directory-identity path)))
        (define ln (and i (findf (λ(x) (equal? i (F-id x))) acc)))
        (when ln (set-F-links! ln (cons (path->string path) (F-links ln))))
        (if (and i (not ln)) (cons (F path i s '()) acc) acc))
      '() path #f)
     > #:key F-size))
  (define (find-duplicates Fs)
    (define t (make-hash))
    (for ([F Fs])
      (define cksum (call-with-input-file (F-name F) sha1))
      (hash-set! t cksum (cons F (hash-ref t cksum '()))))
    (for/list ([(n Fs) (in-hash t)] #:unless (null? (cdr Fs))) Fs))
  (let loop ([Fs Fs])
    (if (null? Fs) '()
        (let-values ([(Fs Rs)
                      (splitf-at Fs (λ(F) (= (F-size F) (F-size (car Fs)))))])
          (append (find-duplicates Fs)
                  (loop Rs))))))

(define (show-duplicates path size)
  (for ([Fs (find-duplicate-files path size)])
    (define (links F)
      (if (null? (F-links F)) ""
          (format " also linked at ~a" (string-join (F-links F) ", "))))
    (printf "~a (~a)~a\n" (F-name (car Fs)) (F-size (car Fs)) (links (car Fs)))
    (for ([F (cdr Fs)]) (printf "  ~a~a\n" (F-name F) (links F)))))

(show-duplicates (find-system-path 'home-dir) 1024)

Raku

(formerly Perl 6)

This implementation takes a starting directory (defaulting to the current directory) and has a few flags to set behaviour: --minsize, the minimum file size to consider (default 5 bytes), and --recurse, whether to recurse into the directory structure (default True). It finds files of the same size, calculates hashes to compare, then reports files that hash the same.

use Digest::SHA256::Native;

sub MAIN( $dir = '.', :$minsize = 5, :$recurse = True ) {
    my %files;
    my @dirs = $dir.IO.absolute.IO;
    while @dirs {
        my @files = @dirs.pop;
        while @files {
            for @files.pop.dir -> $path {
                %files{ $path.s }.push: $path if $path.f and $path.s >= $minsize;
                @dirs.push: $path if $path.d and $path.r and $recurse
            }
        }
    }

    for %files.sort( +*.key ).grep( *.value.elems > 1)».kv -> ($size, @list) {
        my %dups;
        @list.map: { %dups{ sha256-hex( ($_.slurp :bin).decode ) }.push: $_.Str };
        for %dups.grep( *.value.elems > 1)».value -> @dups {
            say sprintf("%9s : ", scale $size ),  @dups.join(', ');
        }
    }
}

sub scale ($bytes) {
    given $bytes {
        when $_ < 2**10 {  $bytes                    ~ ' B'  }
        when $_ < 2**20 { ($bytes / 2**10).round(.1) ~ ' KB' }
        when $_ < 2**30 { ($bytes / 2**20).round(.1) ~ ' MB' }
        default         { ($bytes / 2**30).round(.1) ~ ' GB' }
    }
}
Sample output:

Passing in command line switches: --minsize=0 --recurse=False /home/me/p6

     0 B : /home/me/p6/vor.ppm, /home/me/p6/ns.txt
   190 B : /home/me/p6/scrub(copy).t, /home/me/p6/scrub.t
  1.3 KB : /home/me/p6/coco.p6, /home/me/p6/coc.p6
 80.5 KB : /home/me/p6/temp.txt, /home/me/p6/temp.html
279.6 KB : /home/me/p6/pentaflake.svg, /home/me/p6/5nflake.svg

REXX

bare bones version

This REXX version works with DOS   (with or without Microsoft Windows).
Note that the   tFID   (temp)   file is hard coded to the   C:   drive.
Only minimal error checking is performed.

/*REXX program that reads a (DOS) directory and finds and displays files that are identical.*/
sep=center(' files are identical in size and content: ',79,"═")    /*define the header. */
tFID= 'c:\TEMP\FINDDUP.TMP'                      /*use this as a temporary  FileID.     */
arg maxSize aDir                                 /*obtain optional arguments from the CL*/
if maxSize='' | maxSize="," then maxSize=1000000 /*filesize limit (in bytes) [1 million]*/
aDir=strip(aDir)                                 /*remove any leading or trailing blanks*/
if right(aDir,1)\=='\'  then aDir=aDir"\"        /*possibly add a trailing backslash [\]*/
"DIR"  aDir  '/a-d-s-h /oS /s | FIND "/" >' tFID /*the (DOS) DIR output ───► temp file. */
pFN=                                             /*the previous  filename and filesize. */
pSZ=;  do j=0  while lines(tFID)\==0             /*process each of the files in the list*/
       aLine=linein(tFID)                        /*obtain (DOS) DIR's output about a FID*/
       parse var aLine . . sz fn                 /*obtain the filesize and its fileID.  */
       sz=space(translate(sz,,','),0)            /*elide any commas from the size number*/
       if sz>maxSize  then leave                 /*Is the file > maximum?  Ignore file. */
                                                 /* [↓]  files identical?  (1st million)*/
       if sz==pSZ  then  if charin(aDir||pFN,1,sz)==charin(aDir||FN,1,sz)  then do
                                                                                say sep
                                                                                say pLine
                                                                                say aLine
                                                                                say
                                                                                end
       pSZ=sz;      pFN=FN;      pLine=aLine     /*remember the previous stuff for later*/
       end   /*j*/

if lines(tFID)\==0  then 'ERASE' tFID            /*do housecleaning  (delete temp file).*/
                                                 /*stick a fork in it,  we're all done. */

output   when checking the default root directory:

══════════════════ files are identical in size and content: ═══════════════════
04/13/2013  19:13                76 another.BK
04/13/2013  19:13                76 another.A

══════════════════ files are identical in size and content: ═══════════════════
04/13/2013  17:15               244 gettfid.1
04/13/2013  17:15               244 junk.1

══════════════════ files are identical in size and content: ═══════════════════
03/03/1995  01:46            10,897 $ERR.BK
03/03/1995  01:46            10,897 $ERR.ORI

with error checking

This version of the REXX program:

  •   checks to see if running under the   DOS   environment
  •   uses the   TEMP   folder for storing a temporary file
  •   verifies that the   maxSize   is a positive integer
  •   adjusts the name for a generic file specification
  •   uses variables for some command names and command options
  •   shows the number of files examined and also the directory name
/*REXX program that reads a (DOS) directory and finds and displays files that are identical.*/
sep=center(' files are identical in size and content: ',79,"═")    /*define the header. */
parse arg !;     if !all(arg())  then exit                         /*boilerplate HELP(?)*/
signal on halt;  signal on novalue;  signal on syntax              /*handle exceptions, */

if \!dos  then call err 'this program requires the DOS [environment].'
call getTFID                                     /*defines a temporary  File ID for DOS.*/
arg maxSize aDir                                 /*obtain optional arguments from the CL*/
if maxSize='' | maxSize="," then maxSize=1000000 /*filesize limit (in bytes) [1 million]*/
if \isInt(maxSize)      then call err  "maxSize isn't an integer:"       maxSize
if maxSize<0            then call err  "maxSize can't be negative:"      maxSize
if maxSize=0            then call err  "maxSize can't be zero:"          maxSize
aDir=strip(aDir)                                 /*remove any leading or trailing blanks*/
if right(aDir,3)=='*.*' then aDir=substr(aDir,1,length(aDir)-3)   /*adjust the dir name.*/
if right(aDir,1)\=='\'  then aDir=aDir"\"        /*possibly add a trailing backslash [\]*/
@dir    = 'DIR'                                  /*literal for the (DOS)  DIR  command. */
@dirNots= '/a-d-s-h'                             /*ignore DIRs, SYSTEM, and HIDDEN files*/
@dirOpts= '/oS /s'                               /*sort DIR's (+ subdirs) files by size.*/
@filter = '| FIND "/"'                           /*the "lines" must have a slash [/].   */
@erase  = 'ERASE'                                /*literal for the (DOS)  ERASE command.*/
@dir aDir @dirNots @dirOpts @filter '>' tFID     /*(DOS) DIR  output ──► temporary file.*/
pFN=                                             /*the previous  filename and filesize. */
pSZ=;  do j=0  while lines(tFID)\==0             /*process each of the files in the list*/
       aLine=linein(tFID)                        /*obtain (DOS) DIR's output about a FID*/
       parse var aLine . . sz fn                 /*obtain the filesize and its fileID.  */
       sz=space(translate(sz,,','),0)            /*elide any commas from the size number*/
       if sz>maxSize  then leave                 /*Is the file > maximum?  Ignore file. */
                                                 /* [↓]  files identical?  (1st million)*/
       if sz==pSZ  then  if charin(aDir||pFN,1,sz)==charin(aDir||FN,1,sz)  then do
                                                                                say sep
                                                                                say pLine
                                                                                say aLine
                                                                                say
                                                                                end
       pSZ=sz;      pFN=FN;      pLine=aLine     /*remember the previous stuff for later*/
       end   /*j*/

say j  'file's(j)  "examined in"  aDir           /*show information to the screen.*/
if lines(tFID)\==0  then 'ERASE'  tFID           /*do housecleaning  (delete temp file).*/
exit                                             /*stick a fork in it,  we're all done. */
/*═════════════════════════════general 1─line subs══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════*/
!all:  !!=!;!=space(!);upper !;call !fid;!nt=right(!var('OS'),2)=="NT";!cls=word('CLS VMFCLEAR CLRSCREEN',1+!cms+!tso*2);if arg(1)\==1 then return 0;if wordpos(!,"? ?SAMPLES ?AUTHOR ?FLOW")==0 then return 0;!call=']$H';call "$H" !fn !;!call=;return 1
!cal:  if symbol('!CALL')\=="VAR" then !call=; return !call
!env:  !env='ENVIRONMENT'; if !sys=="MSDOS" | !brexx | !r4 | !roo  then !env='SYSTEM'; if !os2  then !env="OS2"!env; !ebcdic=1=='f1'x;     return
!fid:  parse upper source !sys !fun !fid . 1 . . !fn !ft !fm .; call !sys; if !dos  then do; _=lastpos('\',!fn); !fm=left(!fn,_); !fn=substr(!fn,_+1); parse var !fn !fn "." !ft; end;     return word(0 !fn !ft !fm, 1+('0'arg(1)))
!rex:  parse upper version !ver !vernum !verdate .; !brexx='BY'==!vernum; !kexx="KEXX"==!ver; !pcrexx='REXX/PERSONAL'==!ver | "REXX/PC"==!ver; !r4='REXX-R4'==!ver; !regina="REXX-REGINA"==left(!ver,11); !roo='REXX-ROO'==!ver; call !env;   return
!sys:  !cms=!sys=='CMS'; !os2=!sys=="OS2"; !tso=!sys=='TSO' | !sys=="MVS"; !vse=!sys=='VSE'; !dos=pos("DOS",!sys)\==0|pos('WIN',!sys)\==0|!sys=="CMD"; call !rex;                          return
!var:  call !fid; if !kexx  then return space(dosenv(arg(1)));             return space(value(arg(1),,!env))
err:       say;  say;  say  center(' error! ', 60, "*");  say;  do j=1  for arg();  say arg(j);  say;  end;  say;  exit 13
getdTFID:  tfid=p(!var("TMP") !var('TEMP') homedrive()"\"); if substr(tfid,2,1)==':'&substr(tfid,3,1)\=="\" then tfid=insert('\',t,2);        return strip(tfid,"T",'\')"\"arg(1)'.'arg(2)
getTFID:   if symbol('TFID')=="LIT" then tfid=; if tfid\=='' then return tfid; gfn=word(arg(1) !fn,1);gft=word(arg(2) "TMP",1); tfid='TEMP';if !tso  then tfid=gfn"."gft;if !cms  then tfid=gfn','gft",A4";if !dos then tfid=getdTFID(gfn,gft);return tfid
halt:      call err 'program has been halted.'
homedrive: if symbol('HOMEDRIVE')\=="VAR"  then homedrive=p(!var('HOMEDRIVE') "C:");   return homedrive
isint:     return datatype(arg(1),'W')
novalue:   syntax:   call err 'REXX program' condition("C") 'error',condition("D"),'REXX source statement (line' sigl"):",sourceline(sigl)
p:         return word(arg(1),1)
s:         if arg(1)==1  then return arg(3);   return word(arg(2) 's',1)

output   when using the DIR (folder):   H:\#\REX

══════════════════ files are identical in size and content: ═══════════════════
05/11/2015  18:49               838 UPDATECF.BU
05/11/2015  18:49               838 UPDATECF.TXT

══════════════════ files are identical in size and content: ═══════════════════
03/23/2014  21:55             2,736 EMIRP.RX_
03/26/2014  10:44             2,736 EMIRP2.RX_

══════════════════ files are identical in size and content: ═══════════════════
05/30/2015  17:30             4,542 JUSTIFY.RX_
11/25/2013  06:33             4,542 JUSTIFY.KX_

══════════════════ files are identical in size and content: ═══════════════════
06/15/2014  23:36            13,935 $BLOCK.KX_
05/30/2015  17:28            13,935 $BLOCK.RX_

1568 files examined in H:\#\REX\

Ring

# Project : Find duplicate files

d = "/Windows/System32"
chdir(d)
dir = dir(d)
dirlist = []
for n = 1 to len(dir)
     if dir[n][2] = 0
        str = read(dir[n][1])
        lenstr = len(str)
        add(dirlist,[lenstr,dir[n][1]])
     ok
next
see "Directory : " + d + nl
see "--------------------------------------------" + nl
dirlist = sortfirst(dirlist)
line = 0
for n = 1 to len(dirlist)-1
     if dirlist[n][1] = dirlist[n+1][1]
        see "" + dirlist[n][1] + " " + dirlist[n][2] + nl
        see "" + dirlist[n+1][1] + " " + dirlist[n+1][2] + nl
        if n < len(dirlist)-2 and dirlist[n+1][1] != dirlist[n+2][1]
           line = 1
        ok
     else
        line = 0
     ok
     if line = 1
        see "--------------------------------------------" + nl
     ok
next

func sortfirst(alist)
        for n = 1 to len(alist) - 1
             for m = n + 1 to len(alist)
                  if alist[m][1] < alist[n][1]
                     swap(alist,m,n)
                  ok
                  if alist[m][1] = alist[n][1] and strcmp(alist[m][2],alist[n][2]) < 0
                     swap(alist,m,n)
                  ok
             next
        next
        return alist

Output:

Directory : /Windows/System32
--------------------------------------------
0 nsprs.dll
0 nsprs.tgz
0 nsprs.tgz
0 serauth1.dll
0 serauth1.dll
0 serauth2.dll
--------------------------------------------
16 jm1ixs2.dll
16 qmtn7ft.dll
--------------------------------------------
......
--------------------------------------------
1189376 Windows.Globalization.dll
1189376 wscui.cpl
--------------------------------------------
1192448 Windows.UI.Xaml.Maps.dll
1192448 dfshim.dll
--------------------------------------------
1295360 MSVPXENC.dll
1295360 comres.dll
--------------------------------------------
1311744 SensorsCpl.dll
1311744 msjet40.dll
--------------------------------------------

Ruby

Duplicates are found in two passes: files are first grouped by size, and files of the same size are then compared by MD5 digest.

require 'digest/md5'

def find_duplicate_files(dir)
  puts "\nDirectory : #{dir}"
  Dir.chdir(dir) do
    file_size = Dir.foreach('.').select{|f| FileTest.file?(f)}.group_by{|f| File.size(f)}
    file_size.each do |size, files|
      next if files.size==1
      files.group_by{|f| Digest::MD5.file(f).to_s}.each do |md5,fs|
        next if fs.size==1
        puts "  --------------------------------------------"
        fs.each{|file| puts "  #{File.mtime(file)}  #{size}  #{file}"}
      end
    end
  end
end

find_duplicate_files("/Windows/System32")

Sample Output:

Directory : /Windows/System32
  --------------------------------------------
  2016-02-09 18:56:09 +0900  5120  dxmasf.dll
  2016-02-09 18:56:09 +0900  5120  msdxm.ocx
  --------------------------------------------
  2015-11-14 08:09:16 +0900  91648  mapi32.dll
  2015-11-14 08:09:16 +0900  91648  mapistub.dll
  --------------------------------------------
  2015-11-05 20:34:06 +0900  18592  msvcp110_clr0400.dll
  2015-11-05 20:34:06 +0900  18592  msvcr100_clr0400.dll
  2015-11-05 20:34:06 +0900  18592  msvcr110_clr0400.dll
  --------------------------------------------
  2009-07-14 10:00:32 +0900  31548  perfd009.dat
  2010-11-21 16:14:04 +0900  31548  perfd011.dat

Tested with MS Windows 7.
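
The two-pass idea described above (bucket files by size first, then check content only where sizes collide) is language-independent; the following is a minimal sketch in Rust, not part of either the Ruby or the Rust entry, assuming a flat, non-recursive listing like the Ruby version and stopping short of the content check:

// Sketch only: group regular files by size and keep the colliding groups.
// A real program would go on to hash or byte-compare each group.
use std::collections::HashMap;
use std::path::PathBuf;
use std::{fs, io};

fn duplicate_candidates(dir: &str) -> io::Result<Vec<Vec<PathBuf>>> {
    // Pass 1: bucket regular files in `dir` (non-recursive) by size.
    let mut by_size: HashMap<u64, Vec<PathBuf>> = HashMap::new();
    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        if path.is_file() {
            by_size.entry(fs::metadata(&path)?.len()).or_default().push(path);
        }
    }
    // Pass 2 would hash or byte-compare only these colliding groups.
    Ok(by_size.into_values().filter(|g| g.len() > 1).collect())
}

fn main() -> io::Result<()> {
    for group in duplicate_candidates(".")? {
        println!("{:?}", group);
    }
    Ok(())
}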

Rust

use std::{
    collections::BTreeMap,
    fs::{read_dir, File},
    hash::Hasher,
    io::Read,
    path::{Path, PathBuf},
};

type Duplicates = BTreeMap<(u64, u64), Vec<PathBuf>>;

struct DuplicateFinder {
    found: Duplicates,
    min_size: u64,
}

impl DuplicateFinder {
    fn search(path: impl AsRef<Path>, min_size: u64) -> std::io::Result<Duplicates> {
        let mut result = Self {
            found: BTreeMap::new(),
            min_size,
        };

        result.walk(path)?;
        Ok(result.found)
    }

    fn walk(&mut self, path: impl AsRef<Path>) -> std::io::Result<()> {
        let listing = read_dir(path.as_ref())?;
        for entry in listing {
            let entry = entry?;
            let path = entry.path();
            if path.is_dir() {
                self.walk(path)?;
            } else {
                self.compute_digest(&path)?;
            }
        }

        Ok(())
    }

    fn compute_digest(&mut self, file: &Path) -> std::io::Result<()> {
        let size = file.metadata()?.len();
        if size < self.min_size {
            return Ok(());
        }

        // DefaultHasher is not a cryptographic hash; an external crate could be used instead
        let mut hasher = std::collections::hash_map::DefaultHasher::default();
        let mut bytes = [0u8; 8182];
        let mut f = File::open(file)?;
        loop {
            let n = f.read(&mut bytes[..])?;
            hasher.write(&bytes[..n]);
            if n == 0 {
                break;
            }
        }

        let hash = hasher.finish();

        self.found
            .entry((size, hash))
            .or_insert_with(Vec::new)
            .push(file.to_owned());

        Ok(())
    }
}

fn main() -> std::io::Result<()> {
    let mut args = std::env::args();

    args.next(); // Skip the executable name
    let dir = args.next().unwrap_or_else(|| ".".to_owned());

    let min_size = args
        .next()
        .and_then(|arg| arg.parse::<u64>().ok())
        .unwrap_or(0u64);

    DuplicateFinder::search(dir, min_size)?
        .iter()
        .rev()
        .filter(|(_, files)| files.len() > 1)
        .for_each(|((size, _), files)| {
            println!("Size: {}", size);

            files
                .iter()
                .for_each(|file| println!("{}", file.to_string_lossy()));

            println!();
        });

    Ok(())
}
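
The in-code comment notes that DefaultHasher is not a cryptographic hash. As a minimal sketch only (it assumes the external sha2 crate, which the entry above does not use), compute_digest could instead stream each file through SHA-256:

// Sketch, not part of the entry above: streaming SHA-256 of one file,
// assuming the external `sha2` crate has been added as a dependency.
use std::{fs::File, io::Read, path::Path};

use sha2::{Digest, Sha256};

fn sha256_of_file(file: &Path) -> std::io::Result<Vec<u8>> {
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 8192];
    let mut f = File::open(file)?;
    loop {
        let n = f.read(&mut buf[..])?;
        if n == 0 {
            break;                    // end of file
        }
        hasher.update(&buf[..n]);     // hash only the bytes actually read
    }
    Ok(hasher.finalize().to_vec())    // 32-byte digest
}

fn main() -> std::io::Result<()> {
    // Hypothetical file name, used only to make the sketch runnable.
    println!("{:02x?}", sha256_of_file(Path::new("Cargo.toml"))?);
    Ok(())
}

The digest bytes could then replace the u64 hash in the BTreeMap key, at the cost of a larger key.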

Sidef

It uses the portable File::Find module, which means it should work on virtually any platform.

# usage: sidef fdf.sf [size] [dir1] [...]

require('File::Find')

func find_duplicate_files(Block code, size_min=0, *dirs) {
    var files = Hash()
    %S<File::Find>.find(
        Hash(
            no_chdir => true,
            wanted   => func(arg) {
                var file = File(arg)
                file.is_file || return()
                file.is_link && return()
                var size = file.size
                size >= size_min || return()
                files{size} := [] << file
            },
        ) => dirs...
    )

    files.values.each { |set|
        set.len > 1 || next
        var dups = Hash()
        for i in (^set.end) {
            for (var j = set.end; j > i; --j) {
                if (set[i].compare(set[j]) == 0) {
                    dups{set[i]} := [] << set.pop_at(j++)
                }
            }
        }
        dups.each{ |k,v| code(k.to_file, v...) }
    }

    return()
}

var duplicates = Hash()
func collect(*files) {
    duplicates{files[0].size} := [] << files
}

find_duplicate_files(collect, Num(ARGV.shift), ARGV...)

for k,v in (duplicates.sort_by { |k| -k.to_i }) {
    say "=> Size: #{k}\n#{'~'*80}"
    for files in v {
        say "#{files.sort.join(%Q[\n])}\n#{'-'*80}"
    }
}

Section of sample output:

% sidef fdf.sf 0 /tmp /usr/bin
=> Size: 5656
~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/bin/precat
/usr/bin/preunzip
/usr/bin/prezip
--------------------------
=> Size: 2305
~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/bin/gunzip
/usr/bin/uncompress
--------------------------
=> Size: 2
~~~~~~~~~~~~~~~~~~~~~~~~~~
/tmp/a.txt
/tmp/b.txt
--------------------------
/tmp/m.txt
/tmp/n.txt
--------------------------

Tcl

Only known to work on Unix. It uses both inode-number checking and content hashing for duplicate detection; a small sketch of the inode check appears after the sample output below.

Library: Tcllib (Package: fileutil)
Library: Tcllib (Package: md5)
package require fileutil
package require md5

proc finddupfiles {dir {minsize 1}} {
    foreach fn [fileutil::find $dir] {
    file lstat $fn stat
    if {$stat(size) < $minsize} continue
    dict lappend byino $stat(dev),$stat(ino) $fn
    if {$stat(type) ne "file"} continue
    set f [open $fn "rb"]
    set content [read $f]
    close $f
    set md5 [md5::md5 -hex $content]
    dict lappend byhash $md5 $fn
    }
    set groups {}
    foreach group [dict values $byino] {
    if {[llength $group] <= 1} continue
    set gs [lsort $group]
    dict set groups [lindex $gs 0] $gs
    }
    foreach group [dict values $byhash] {
    if {[llength $group] <= 1} continue
    foreach f $group {
        if {[dict exists $groups $f]} {
        dict set groups $f [lsort -unique \
            [concat [dict get $groups $f] $group]]
        unset group
        break
        }
    }
    if {[info exist group]} {
        set gs [lsort $group]
        dict set groups [lindex $gs 0] $gs
    }
    }
    set masters {}
    dict for {n g} $groups {
    lappend masters [list $n [llength $g],$n]
    }
    set result {}
    foreach p [lsort -decreasing -index 1 -dictionary $masters] {
    set n [lindex $p 0]
    lappend result $n [dict get $groups $n]
    }
    return $result
}

foreach {leader dupes} [finddupfiles {*}$argv] {
    puts "$leader has duplicates"
    set n 0
    foreach d $dupes {
    if {$d ne $leader} {
        puts "   dupe #[incr n]: $d"
    }
    }
}

Section of sample output:

./compat/zlib/zconf.h has duplicates
   dupe #1: ./compat/zlib/zconf.h.in
./compat/zlib/contrib/vstudio/vc10/zlib.rc has duplicates
   dupe #1: ./compat/zlib/contrib/vstudio/vc9/zlib.rc
./compat/zlib/contrib/delphi/zlibd32.mak has duplicates
   dupe #1: ./compat/zlib/contrib/pascal/zlibd32.mak
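
The inode check the Tcl entry relies on groups names that are hard links to the same data before any hashing is done. The following is a minimal sketch of just that check, written in Rust for illustration and not part of the Tcl entry, assuming a Unix-like system where std::os::unix is available:

// Sketch only: group paths by (device, inode) so hard links to the same
// data can be reported together. Unix-only, mirroring `file lstat` above.
use std::collections::HashMap;
use std::os::unix::fs::MetadataExt;
use std::path::PathBuf;
use std::{fs, io};

fn hard_link_groups(paths: &[PathBuf]) -> io::Result<HashMap<(u64, u64), Vec<PathBuf>>> {
    let mut groups: HashMap<(u64, u64), Vec<PathBuf>> = HashMap::new();
    for p in paths {
        // symlink_metadata, like lstat, does not follow symbolic links.
        let meta = fs::symlink_metadata(p)?;
        groups.entry((meta.dev(), meta.ino())).or_default().push(p.clone());
    }
    Ok(groups)
}

fn main() -> io::Result<()> {
    let paths: Vec<PathBuf> = std::env::args().skip(1).map(PathBuf::from).collect();
    for ((dev, ino), names) in hard_link_groups(&paths)? {
        if names.len() > 1 {
            println!("device {} inode {}: {:?}", dev, ino, names);
        }
    }
    Ok(())
}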

Wren

Library: Wren-crypto
Library: Wren-sort
import "io" for Directory, File, Stat
import "./crypto" for Sha1
import "./sort" for Sort

var findDuplicates = Fn.new { |dir, minSize|
    if (!Directory.exists(dir)) Fiber.abort("Directory does not exist.")
    var files = Directory.list(dir).where { |f| Stat.path("%(dir)/%(f)").size >= minSize }
    var hashMap = {}
    for (file in files) {
        var path = "%(dir)/%(file)"
        if (Stat.path(path).isDirectory) continue
        var contents = File.read(path)
        var hash = Sha1.digest(contents)
        var exists = hashMap.containsKey(hash)
        if (exists) {
            hashMap[hash].add(file)
        } else {
            hashMap[hash] = [file]        
        }
    }
    var duplicates  = []
    for (key in hashMap.keys) {
        if (hashMap[key].count > 1) {
            var files = hashMap[key]
            var path = "%(dir)/%(files[0])"
            var size = Stat.path(path).size
            duplicates.add([size, files])
        }
    }
    var cmp = Fn.new { |i, j| (j[0] - i[0]).sign } // by decreasing size
    Sort.insertion(duplicates, cmp)
    System.print("The sets of duplicate files are:\n")
    for (dup in duplicates) {
        System.print("Size %(dup[0]) bytes:")
        System.print(dup[1].join("\n"))
        System.print()
    }
}

findDuplicates.call("./", 1000)
Output:

The sets of duplicate files are:

Size 57221 bytes:
big.wren
big2.wren

Size 16696 bytes:
cls
clsc

Size 4096 bytes:
data.blk
data2.blk

Size 1415 bytes:
circular.wren
circular2.wren

zkl

Uses the MsgHash DLL so I don't have to read the entire file into memory to hash it (the built-in MD5 only hashes a single blob; MsgHash can hash a chunked blob).

I tried threading this; even though it was over twice as fast, it wasn't really worth it.

File findDupFiles.zkl:

include(zkl.h.zkl);
const FLAGS=FILE.GLOB.IGNORE_CASE + FILE.GLOB.NO_DIRS;
var [const] MsgHash=Import("zklMsgHash");
var recurse=False, fileSpec, minSz=0, maxSz=(0).MAX;

argh:=Utils.Argh(
   T("+R","R","Recurse into subdirectories, starting at <arg>",
    fcn(arg){ recurse=arg }),
   T("+minSz","","Only consider files larger than <arg>",
    fcn(arg){ minSz=arg.toInt() }),
   T("+maxSz","","Only consider files less than <arg>",
    fcn(arg){ maxSz=arg.toInt() }),
);

argh.parse(vm.arglist);
try { fileSpec=argh.loners[0]; }
catch{
   argh.usage("Find duplicate files");
   System.exit(1);
}

fnames:=Data(0,String);
if (recurse) File.globular(recurse,fileSpec,True,FLAGS,fnames);
else         File.glob(fileSpec,FLAGS).pump(fnames);

files:=Dictionary();  // (len:(name,name...), ...)
foreach fname in (fnames){
   sz:=File.len(fname);
   if(minSz<=sz<=maxSz) files.appendV(File.len(fname),fname);
}

    //////////////////////// group files by size
files=files.pump(List,Void.Xplode,fcn(k,v){ v.len()>1 and v or Void.Skip });
println("Found %d groups of same sized files, %d files total.".fmt(files.len(),
   files.apply("len").sum(0)));

if(not files) System.exit();    // no files found

buffer:=Data(0d100_000);  // we'll reuse this buffer for hashing
hashes:=files.pump(List,'wrap(fnames){ // get the MD5 hash for each file
   fnames.pump(List,'wrap(fname){
      file,hash := File(fname,"rb"), MsgHash.toSink("MD5");
      file.pump(buffer,hash); file.close();
      return(hash.close(),fname); // -->( (hash,name), (hash,name) ... )
   })
},T(Void.Write,Void.Write)); // flatten list of lists of lists to above

   // Hash the file hashes, then scoop out the files with the same hash
buffer:=Dictionary();
files:=hashes.pump(Void,Void.Xplode,buffer.appendV)
       .pump(List,Void.Xplode,fcn(k,v){ v.len()>1 and v or Void.Skip });
  
println("Found %d duplicate files:".fmt(files.apply("len").sum(0)));
foreach group in (files){ println("   ",group.concat(", ")) }
Output:
$ zkl findDupFiles.zkl
Find duplicate files
Options:
  --R (-R) <arg>: Recurse into subdirectories, starting at <arg>
  --maxSz <arg>: Only consider files less than <arg>
  --minSz <arg>: Only consider files larger than <arg>

$ zkl findDupFiles.zkl '*' 
Found 16 groups of same sized files, 34 files total.
Found 8 duplicate files:
   unixdict.txt, dict.txt, uuuu.txt
   zz.zkl, zzDup.zkl
   gooperf.dat, zklTmpFile082p1V, test.dat