#!/usr/bin/env perl
#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder <tom@sanctum.geek.nz>
# Site: <https://sanctum.geek.nz/cgit/checkem.git>
#
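# Usage (arguments are files or directories to search; paths illustrative):
#
#   checkem ~/Pictures /mnt/backup/Pictures
#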
package main;
# Force me to write this properly
use strict;
use warnings;
use utf8;
# Tolerate very old Perls
use 5.006;
# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Digest;
use English '-no_match_vars';
use Fcntl ':mode';
use File::Find;
# Version number to make Perl::Critic happy
our $VERSION = 2.18;
# Complain if there are no arguments
if ( !@ARGV ) {
    printf {*STDERR} "Need at least one file or directory\n";
    exit 2;
}
# Convenience keys into lstat() return array for clarity and to appease
# Perl::Critic
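# (dev, ino, mode, and size sit at offsets 0, 1, 2, and 7 of the list that
# lstat returns, as documented in perldoc -f stat)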
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);
# Use either the specified algorithm or a default list
my @algs =
  exists $ENV{CHECKEM_ALG}
  ? $ENV{CHECKEM_ALG}
  : qw(SHA-256 SHA-1 MD5);
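# e.g. running with CHECKEM_ALG=SHA-512 in the environment requests that one
# algorithm only (any name that Digest->new() accepts should work here)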
# Build digest object or give up
my $dig;
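# (Digest->new() dies if the algorithm's backend module can't be loaded, so
# each attempt is wrapped in eval and we fall through to the next candidate)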
for (@algs) {
    last if eval { $dig = Digest->new($_) };
}
defined $dig
  or die "Could not create a useable Digest object\n";
# Start a hash of filesizes to file names/stats...
my %sizes;
# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {
        # Start a hash to represent this file
        my %f = ( name => $File::Find::name );
        # Pull in the stat values we care about
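        # (a hash slice assignment: keys and values return entries in the same
        # order, and the assignment is false when lstat fails and returns an
        # empty list)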
        if ( @f{ keys %STATS } = ( lstat $f{name} )[ values %STATS ] ) {
            # Check it's a plain old file
            return if not S_ISREG( $f{mode} );
            # Check its size is non-zero
            return if not $f{size};
            # Push the file hash into its size's bucket
            return push @{ $sizes{ $f{size} } }, \%f;
        }
        # Complain that we couldn't stat
        else {
            warn "Could not stat $f{name}: $ERRNO\n";
        }
        # Return if we got to here
        return;
    },
}, @ARGV;
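# Every file record is now bucketed by size; a size with only one file in it
# cannot contain duplicates, so those buckets are skipped below.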
# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my %sums;
SIZE: for my $fs ( grep { @{$_} > 1 } values %sizes ) {
    # Keep a temporary table of inodes to catch hard links
    my %inos;
    # Iterate through each file in the list
    FILE: for my $f ( @{$fs} ) {
        # Catch hard links on compliant systems by keeping a dev/inode hash
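        # (two names with the same device and inode numbers are the same file,
        # so only the first one seen needs to be checksummed)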
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev and $ino ) {
            next if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }
        # File is still the same size and not hard linked; group it by digest
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
            push @{ $sums{ $f->{hexdigest} = $dig->hexdigest() } }, $f;
            close $fh
              or warn "Could not close $f->{name}: $ERRNO\n";
        }
        else {
            warn "Could not open $f->{name}: $ERRNO\n";
        }
    }
}
# Print the groups of matched files (more than one file shares a checksum in
# the final table); sort the blocks by filesize, and the files within each
# block by name
GROUP:
for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
  )
{
    printf "%s\n\n", join "\n", sort map { $_->{name} } @{$group};
}
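# Output is one group of duplicate paths per block, separated by a blank line,
# for example (paths are illustrative):
#
#   /home/user/a.jpg
#   /mnt/backup/a.jpg
#
#   /home/user/b.iso
#   /mnt/backup/b.iso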