aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Ryder <tom@sanctum.geek.nz>2011-04-18 14:40:08 +1200
committerTom Ryder <tom@sanctum.geek.nz>2011-04-18 14:40:08 +1200
commitb06ce99e8bddce5538f3d441fb2f7356f05589c5 (patch)
tree5aed596fd46524fc1bd79f173aab5b072b2553af
downloadcheckem-b06ce99e8bddce5538f3d441fb2f7356f05589c5.tar.gz (sig)
checkem-b06ce99e8bddce5538f3d441fb2f7356f05589c5.zip
First commit.v1.0
-rw-r--r--README2
-rwxr-xr-xcheckem.pl171
2 files changed, 173 insertions, 0 deletions
diff --git a/README b/README
new file mode 100644
index 0000000..16926fb
--- /dev/null
+++ b/README
@@ -0,0 +1,2 @@
+Efficiently find duplicate files using Perl on Linux. See comments for
+further details.
diff --git a/checkem.pl b/checkem.pl
new file mode 100755
index 0000000..793a7f0
--- /dev/null
+++ b/checkem.pl
@@ -0,0 +1,171 @@
+#! /usr/bin/perl
+
+#
+# checkem - Find duplicate files with standard Perl libraries
+#
+# Tom Ryder (tejr) 2011
+#
+# This script parses a whole directory tree for duplicate files. It uses an
+# efficient method of checking filesizes first, and hashing only when
+# filesizes match. It returns files that are duplicates in groups delineated
+# by blank lines to stdout.
+#
+# Optional first argument is the directory tree to scan. Defaults to pwd.
+# Optional remaining arguments are filename regexes to exclude. Defaults to
+# null.
+#
+
+# Force me to write this properly.
+use warnings;
+use strict;
+
+# Get the find and current working directory modules.
+use File::Find;
+use Cwd;
+
+# Initialise global directory name scalar.
+my $dir;
+
+# Initialise a couple of important global-scope hashes.
+my (%sizes, %matches);
+
+# Open output as stdout. You could change this, but it's easier just to pipe it.
+open OUTPUT, '>>', '/dev/stdout';
+
+# If no arguments, work with the current working directory.
+if (! $ARGV[0]) {
+ $dir = cwd;
+}
+
+# If arguments, check for existence, then confirm as directory to scan.
+elsif (-d $ARGV[0]) {
+ $dir = shift @ARGV;
+}
+
+# If it doesn't exist, halt.
+else {
+ print 'Directory '.$ARGV[0].' not found.'."\n";
+ exit 1;
+}
+
+# Any further arguments are regex exclusions. This could very well be null. That's fine.
+my @exclusions = @ARGV;
+
+# Declare a sub that returns filesizes.
+sub filesize {
+ return (stat(shift))[7];
+}
+
+# Declare a sub that returns inodes.
+sub inode {
+ return (stat(shift))[1];
+}
+
+# Declare a sub that returns checksums.
+sub checksum {
+ my $file = shift;
+ return substr(`md5sum "$file"`, 0, 32);
+}
+
+# Declare the wanted sub ...
+sub wanted {
+
+ # If it's a file ...
+ if (-f $File::Find::name)
+ {
+ # Just to make this easier ...
+ my $file = $File::Find::name;
+
+ # Check it doesn't match any exclusions.
+ my $exclude = 0;
+ foreach my $exclusion (@exclusions) {
+ $exclude = 1 if ($file =~ m/$exclusion/);
+ }
+
+ # No? Good! Start processing it.
+ if (! $exclude)
+ {
+ # Get filesize and add it to hash.
+ my $size = filesize($file);
+ $sizes{ $file } = $size;
+
+ # Start a matches array reference.
+ $matches{ $file } = [ ];
+ }
+ }
+}
+
+# ... and start the find process on the directory!
+find \&wanted, $dir;
+
+# Get a sorted list of the sizes keys by filesize. This groups them nicely.
+my @files_sorted = sort {$sizes{$a} <=> $sizes{$b}} keys %sizes;
+
+# Start with an impossible filesize and a null filename.
+my $any_matches = 0;
+my $current_size = -1;
+my $current_matching;
+
+# For each of the files in that sorted array,
+foreach my $file (@files_sorted) {
+
+ # ... check that:
+ if (
+ # The filesizes according to the %sizes hash match;
+ $sizes{$file} == $current_size
+ and
+ # ... they aren't the same inode, in which case it's probably an intentional hard link;
+ inode($current_matching) != inode($file)
+ and
+ # ... the MD5 hashes match.
+ checksum($current_matching) eq checksum($file)
+ )
+ {
+ # If so, conclude it's a duplicate, and add it to the matches for this file.
+ $any_matches = 1;
+ push @{$matches{$current_matching}}, $file;
+ }
+
+ # Otherwise, start a new group and keep iterating.
+ else
+ {
+ $current_matching = $file;
+ $current_size = $sizes{$file}
+ }
+}
+
+# The flag for any matches is still false? Success!
+if (! $any_matches) {
+ print 'No matches.', "\n";
+ exit 0;
+}
+
+# We don't need these now.
+undef $any_matches;
+undef $current_matching;
+undef $current_size;
+undef $dir;
+undef @exclusions;
+undef @files_sorted;
+undef %sizes;
+
+# Start printing some output.
+print OUTPUT "\n", 'Summary of matches:', "\n\n";
+
+# Print each group of duplicate files!
+foreach my $match (keys %matches) {
+ # Glean the matches for this file by dereferencing that array reference from the matches hash.
+ my @this_matches = @{$matches{$match}};
+
+ # If there are matches, print them.
+ if ($#this_matches > -1) {
+ print OUTPUT $match, "\n";
+ print OUTPUT $_, "\n" foreach (@this_matches);
+ print OUTPUT "\n";
+ }
+}
+
+# Close the output, whatever it is, and we're done.
+close OUTPUT;
+exit 0;
+