#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder
# Site:
#

# Package name
package File::Duplicates::Checkem;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Tolerate very old Perls
use 5.006;

# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest;

# Version number to make Perl::Critic happy
our $VERSION = 2.12;

# If no arguments, bail out with a usage message
if ( !@ARGV ) {
    printf {*STDERR} "%s\n", 'Need at least one file or directory';
    exit 2;
}

# Convenience keys into stat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# We need to pick and create a Digest object
my $dig;

# We were told which algorithm to use
if ( exists $ENV{CHECKEM_ALG} ) {
    $dig = Digest->new( $ENV{CHECKEM_ALG} );
}

# Try worse and worse algorithms until we get a digest object, stopping at
# the first one that works
else {
    ALG:
    for my $alg (qw(SHA-256 SHA-1 MD5)) {
        last ALG if eval { $dig = Digest->new($alg) };
    }
}

# Still no digest object, give up
if ( !defined $dig ) {
    croak 'Could not create a usable Digest object';
}

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {

        # Start a hash to represent this file
        my %f = ( name => $File::Find::name );

        # Pull in the file stat values we care about
        @f{ keys %STATS } = ( stat $f{name} )[ values %STATS ]
            or return;

        # Check it's a regular file
        return if not S_ISREG( $f{mode} );

        # Check its size is non-zero
        return if not $f{size};

        # Push the file hash into its size's bucket
        return push @{ $sizes{ $f{size} } }, \%f;
    },
}, @ARGV;

# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my %sums;
SIZE:
for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
    FILE:
    for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev && $ino ) {
            next FILE if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Files still the same size and not hard linked; group them by
        # digest, reusing the one Digest object, which resets itself each
        # time its digest is read
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
            push @{ $sums{ $dig->digest() } }, $f;
            close $fh
                or carp "Failed to close $f->{name}: $!";
        }
        else {
            carp "Failed to open $f->{name}: $!";
        }
    }
}

# Print the groups of matched files (more than one share a checksum in the
# final table); sort the blocks by the filesize, and the files within each
# block by name
GROUP:
for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
    )
{
    printf "%s\n\n", join "\n",
        sort map { $_->{name} } @{$group};
}
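
# A minimal usage sketch, kept as comments so this file remains a plain
# script. The paths and duplicate groups shown are hypothetical, but the
# invocation and output shape follow from the code above: CHECKEM_ALG is the
# environment variable read near the top, arguments are files or directories
# to scan, and the GROUP loop prints one sorted group of duplicate names per
# block, blocks separated by blank lines, smallest files first.
#
#   $ CHECKEM_ALG=SHA-256 checkem ~/Pictures /mnt/backup
#   /home/user/Pictures/cat.jpg
#   /mnt/backup/Pictures/cat.jpg
#
#   /home/user/Pictures/holiday.jpg
#   /mnt/backup/old/holiday.jpg
#
# With no arguments the script prints an error to standard error and exits
# with status 2.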