#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with libraries that have been in Perl
# Core since version 5.9.3
#

# Package name
package File::Checkem;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Get the error-reporting, file mode constant, directory traversal, and
# checksum modules
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest::SHA;

# Lowest version number that has all of those core modules; Digest::SHA is the
# newest
use 5.009003;

# Version number to make Perl::Critic happy
our $VERSION = 2.4;

# Bail out if we weren't given at least one directory to search
croak 'Need at least one dir to search' if !@ARGV;

# Convenience keys into stat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# Figure out the SHA algorithm to use; defaults to sha256, but can be
# overridden by setting CHECKEM_ALG in the environment to e.g. "sha1", which is
# slightly faster but has had practical collision attacks since early 2017
my $alg = $ENV{CHECKEM_ALG} // 'sha256';

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {
        my $fn = $File::Find::name;

        # Keep only the stat values we will actually need; bail out if stat()
        # failed and returned an empty list
        my %st;
        @st{ keys %STATS } = ( stat $fn )[ values %STATS ]
            or return;

        # Check it's a regular file; S_ISREG tests the whole file type field,
        # whereas a plain bitwise AND with S_IFREG would also match sockets
        return if not S_ISREG( $st{mode} );

        # Check its size is non-zero
        return if not $st{size};

        # Push the filename and the stats into this size's bucket
        return push @{ $sizes{ $st{size} } },
            {
            fn => $fn,
            st => \%st,
            };
    },
}, @ARGV;

# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my ( %sums, $dig );
for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
    for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{ $f->{st} }{qw(dev ino)};
        if ( $dev && $ino ) {
            next if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # File is still the same size and isn't a hard link we've already
        # seen, so group it by digest; create the digest object if it isn't
        # already defined (digest() below resets it, so one object can be
        # reused for every file)
        ( $dig //= Digest::SHA->new($alg) )->addfile( $f->{fn} );
        push @{ $sums{ $dig->digest() } }, $f;
    }
}

# Print the groups of matched files (more than one filename sharing a checksum
# in the final table)
for my $group ( grep { @{$_} > 1 } values %sums ) {
    printf "%s\n\n", join "\n", map { $_->{fn} } @{$group};
}
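
# A quick usage sketch; the paths and output below are hypothetical examples,
# not anything this script produces by default. Each group of duplicates is
# printed as one filename per line, with a blank line between groups:
#
#     $ checkem ~/photos /mnt/backup/photos
#     /home/user/photos/img_0123.jpg
#     /mnt/backup/photos/img_0123.jpg
#
#     # Assuming a SHA-512 run is wanted instead of the sha256 default:
#     $ CHECKEM_ALG=sha512 checkem ~/photos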