#!/usr/bin/env perl
#
# checkem: Find groups of duplicate files with libraries that have been in Perl
# Core since version 5.9.3
#
# Usage: checkem DIR [DIR...]
#
# Walks every directory given on the command line, buckets non-empty regular
# files by size, drops additional hard links to an already-seen file, hashes
# what is left with SHA-256, and prints each set of paths that share a digest,
# one path per line, with a blank line after every set.
#

# Package name
package File::Checkem;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Get the error reporting, file mode, directory walking, and digest modules
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest::SHA;

# Lowest version number that has all of those core modules; Digest::SHA is the
# newest
use 5.009003;

# Version number to make Perl::Critic happy
our $VERSION = 2.2;

# Refuse to run without at least one directory to search
croak('Need at least one dir to search') if !@ARGV;

# Convenience keys into stat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {
        my $fn = $File::Find::name;

        # Skip anything we cannot stat (vanished, dangling symlink, ...)
        my @st = stat $fn or return;

        # Only regular files are candidates. S_ISREG() compares the whole
        # file-type field; a bare "& S_IFREG" bit test would also accept
        # other types whose type code shares that bit (e.g. sockets).
        return if !S_ISREG( $st[ $STATS{mode} ] );

        # Empty files are all trivially identical; ignore them
        return if !$st[ $STATS{size} ];

        push @{ $sizes{ $st[ $STATS{size} ] } },
          {
            fn => $fn,
            st => \@st,
          };
        return;
    },
}, @ARGV;

# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my ( %sums, $dig );
FS: for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
  F: for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{ $f->{st} }[ @STATS{qw(dev ino)} ];
        if ( $dev && $ino ) {
            next F if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Files still the same size and not hard linked, group by digest.
        # addfile() croaks if the file cannot be opened or read, so trap that
        # and carry on rather than losing the whole run to one bad file.
        $dig //= Digest::SHA->new('sha256');
        my $sum;
        my $ok = eval {
            $sum = $dig->addfile( $f->{fn} )->digest();
            1;
        };
        if ( !$ok ) {
            my $err = $@ || 'unknown error';
            chomp $err;
            carp "Skipping $f->{fn}: $err";

            # The object may hold partially-read data; throw it away so the
            # next file starts from a clean state
            undef $dig;
            next F;
        }
        push @{ $sums{$sum} }, $f;
    }
}

# Print the groups of matched files (more than one share a checksum in the
# final table)
for my $group ( grep { @{$_} > 1 } values %sums ) {
    printf "%s\n\n", join "\n", map { $_->{fn} } @{$group};
}