Find duplicate files: Difference between revisions

Added Java Solution
m (Promote to task, lots of examples, little controversy)
(Added Java Solution)
Line 246:
Something went wrong - please use ./finddups <dir> <bytes>
 
</pre>
 
=={{header|Java}}==
This should work on with any OS or filesystem supported by Java.
Hard links are indicated by displaying the files on the same line separated by "=".
MD5 checksums are used to detect duplicate files.
<lang java>import java.io.*;
import java.nio.*;
import java.nio.file.*;
import java.nio.file.attribute.*;
import java.security.*;
import java.util.*;
 
public class DuplicateFiles {
public static void main(String[] args) {
if (args.length != 2) {
System.err.println("Directory name and minimum file size are required.");
System.exit(1);
}
try {
findDuplicateFiles(args[0], Long.parseLong(args[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
 
private static void findDuplicateFiles(String directory, long minimumSize)
throws IOException, NoSuchAlgorithmException {
System.out.println("Directory: '" + directory + "', minimum size: " + minimumSize + " bytes.");
Path path = FileSystems.getDefault().getPath(directory);
FileVisitor visitor = new FileVisitor(path, minimumSize);
Files.walkFileTree(path, visitor);
System.out.println("The following sets of files have the same size and checksum:");
for (Map.Entry<FileKey, Map<Object, List<String>>> e : visitor.fileMap_.entrySet()) {
Map<Object, List<String>> map = e.getValue();
if (!containsDuplicates(map))
continue;
List<List<String>> fileSets = new ArrayList<>(map.values());
Collections.sort(fileSets, new StringListComparator());
FileKey key = e.getKey();
System.out.println();
System.out.println("Size: " + key.size_ + " bytes");
for (List<String> files : fileSets) {
Collections.sort(files);
for (int i = 0, n = files.size(); i < n; ++i) {
if (i > 0)
System.out.print(" = ");
System.out.print(files.get(i));
}
System.out.println();
}
}
}
 
private static class StringListComparator implements Comparator<List<String>> {
public int compare(List<String> a, List<String> b) {
int len1 = a.size(), len2 = b.size();
for (int i = 0; i < len1 && i < len2; ++i) {
int c = a.get(i).compareTo(b.get(i));
if (c != 0)
return c;
}
return Integer.compare(len1, len2);
}
}
 
private static boolean containsDuplicates(Map<Object, List<String>> map) {
if (map.size() > 1)
return true;
for (List<String> files : map.values()) {
if (files.size() > 1)
return true;
}
return false;
}
 
private static class FileVisitor extends SimpleFileVisitor<Path> {
private MessageDigest digest_;
private Path directory_;
private long minimumSize_;
private Map<FileKey, Map<Object, List<String>>> fileMap_ = new TreeMap<>();
 
private FileVisitor(Path directory, long minimumSize) throws NoSuchAlgorithmException {
directory_ = directory;
minimumSize_ = minimumSize;
digest_ = MessageDigest.getInstance("MD5");
}
 
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
if (attrs.size() >= minimumSize_) {
FileKey key = new FileKey(file, attrs, getMD5Sum(file));
Map<Object, List<String>> map = fileMap_.get(key);
if (map == null)
fileMap_.put(key, map = new HashMap<>());
List<String> files = map.get(attrs.fileKey());
if (files == null)
map.put(attrs.fileKey(), files = new ArrayList<>());
Path relative = directory_.relativize(file);
files.add(relative.toString());
}
return FileVisitResult.CONTINUE;
}
 
private byte[] getMD5Sum(Path file) throws IOException {
digest_.reset();
try (DigestInputStream in = new DigestInputStream(new BufferedInputStream(new
FileInputStream(file.toString())), digest_)) {
while (in.read() != -1) {}
}
return digest_.digest();
}
}
 
private static class FileKey implements Comparable<FileKey> {
private byte[] hash_;
private long size_;
 
private FileKey(Path file, BasicFileAttributes attrs, byte[] hash) throws IOException {
size_ = attrs.size();
hash_ = hash;
}
 
public int compareTo(FileKey other) {
int c = Long.compare(other.size_, size_);
if (c == 0)
c = hashCompare(hash_, other.hash_);
return c;
}
}
 
private static int hashCompare(byte[] a, byte[] b) {
int len1 = a.length, len2 = b.length;
for (int i = 0; i < len1 && i < len2; ++i) {
int c = Byte.compare(a[i], b[i]);
if (c != 0)
return c;
}
return Integer.compare(len1, len2);
}
}</lang>
 
{{out}}
<pre>
Directory: 'test', minimum size: 1000 bytes.
The following sets of files have the same size and checksum:
 
Size: 16370 bytes
file2
file4
file5
 
Size: 8188 bytes
file1 = file3
file6
</pre>
 
1,777

edits