author     Tom Ryder <tom@sanctum.geek.nz>  2017-07-29 17:38:12 +1200
committer  Tom Ryder <tom@sanctum.geek.nz>  2017-07-29 17:48:59 +1200
commit     27d5f435fad45378e289461040dd7ab2588d6a46 (patch)
tree       07e37471227f002d25d227206c2dbaab81eec0d1
parent     Can just use stat now (diff)
download   checkem-2.1.tar.gz (sig)
           checkem-2.1.zip

Refactor, version bump (v2.1)
Do initial size filtering and collection in the wanted sub, and delay inode checking until filtering each size
-rwxr-xr-x  checkem  63
1 file changed, 29 insertions(+), 34 deletions(-)
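As the commit message describes, the refactor folds the size bucketing into the File::Find wanted callback itself, so only files that collide on size survive to the later passes. A minimal standalone sketch of that first phase (variable names simplified; not the verbatim checkem source):

    use strict;
    use warnings;
    use File::Find;
    use File::stat;

    # Bucket regular, non-empty, non-symlink files by size as they are found
    my %sizes;
    find(
        {
            no_chdir => 1,
            wanted   => sub {
                my $fn = $File::Find::name;
                return if !-f $fn;              # regular files only
                return if -l $fn;               # skip symlinks
                my $st = stat $fn or return;    # File::stat object
                return if !$st->size;           # ignore empty files
                push @{ $sizes{ $st->size } }, { fn => $fn, st => $st };
            },
        },
        @ARGV,
    );

The diff below shows the same change against the released script, including the removal of the old flat @fns list that the wanted sub used to fill.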
diff --git a/checkem b/checkem
index 1aa05db..4f8fa84 100755
--- a/checkem
+++ b/checkem
@@ -19,15 +19,15 @@ use Digest::SHA;
use 5.009003;
# Version number to make Perl::Critic happy
-our $VERSION = 2.0;
+our $VERSION = 2.1;
# Complain if we weren't given at least one directory to search
croak('Need at least one dir to search') if !@ARGV;
-# Start an array of filenames...
-my @fns;
+# Start a hash of filesizes to file names/stats...
+my %sizes;
-# ... and fill it up with File::Find.
+# ...and fill it up with File::Find.
find {
no_chdir => 1,
wanted => sub {
@@ -35,49 +35,44 @@ find {
## no critic (ProhibitFiletest_f)
return if !-f $fn;
return if -l $fn;
- push @fns, $fn;
+ my $st = stat $fn or return;
+ return if !$st->size;
+ push @{ $sizes{ $st->size } },
+ {
+ fn => $fn,
+ st => $st,
+ };
return;
},
}, @ARGV;
-# Start a table of device and inode to detect hard links, to completely skip
-# any files we've already seen
-my %inodes;
-
-# Build a table of filesizes to list of filenames with that size
-my %sizes;
-FN: for my $fn (@fns) {
-
- # Get file metadata
- my $st = stat $fn or next FN;
+# If there's more than one filename of any of the sizes, look for hard links,
+# checksum them if not linked, and push them into a sums table
+my ( %sums, $dig );
+FS: for my $fs ( grep { @{$_} > 1 } values %sizes ) {
- # Skip the file if it's empty
- next FN if !$st->size;
+ # Keep a temporary table of inodes to catch hard links
+ my %inos;
- # Try to get a device ID and inode, so we can ignore hard-linked files
- # using the inodes lookup table if we've seen that tuple before
- if ( $st->dev && $st->ino ) {
- next FN if exists $inodes{ $st->dev }{ $st->ino };
- $inodes{ $st->dev }{ $st->ino } = 1;
- }
+ # Iterate through each file in the list
+ F: for my $f ( @{$fs} ) {
- # Add the filename to its size bucket
- push @{ $sizes{ $st->size } }, $fn;
-}
+ # Catch hard links on compliant systems by keeping a dev/inode hash
+ my ( $dev, $ino ) = ( $f->{st}->dev, $f->{st}->ino );
+ if ( $dev && $ino ) {
+ next F if exists $inos{$dev}{$ino};
+ $inos{$dev}{$ino} = $f;
+ }
-# If there's more than one filename of any of the sizes, checksum them and push
-# them into a sums table
-my ( %sums, $dig );
-SIZE: for my $fns ( grep { @{$_} > 1 } values %sizes ) {
- for my $fn ( @{$fns} ) {
+ # File shares a size with others and isn't a hard link; group it by digest
$dig //= Digest::SHA->new('sha256');
- $dig->addfile($fn);
- push @{ $sums{ $dig->digest() } }, $fn;
+ $dig->addfile( $f->{fn} );
+ push @{ $sums{ $dig->digest() } }, $f;
}
}
# Print the groups of matched files (more than one share a checksum in the
# final table)
for my $group ( grep { @{$_} > 1 } values %sums ) {
- printf "%s\n\n", join "\n", @{$group};
+ printf "%s\n\n", join "\n", map { $_->{fn} } @{$group};
}
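Read without the diff markers, the second phase this commit arrives at defers hard-link detection into each size bucket and only then groups by SHA-256. The sketch below continues from the %sizes hash built in the first phase and is reconstructed from the hunks above rather than copied verbatim from checkem 2.1:

    use Digest::SHA;

    # For each size bucket holding more than one file, skip hard links by
    # remembering device/inode pairs, then group the remainder by SHA-256 digest
    my ( %sums, $dig );
    for my $fs ( grep { @{$_} > 1 } values %sizes ) {
        my %inos;    # per-bucket dev/inode table

        F: for my $f ( @{$fs} ) {
            my ( $dev, $ino ) = ( $f->{st}->dev, $f->{st}->ino );
            if ( $dev && $ino ) {
                next F if exists $inos{$dev}{$ino};    # hard link to a file already queued
                $inos{$dev}{$ino} = $f;
            }

            # Digest::SHA objects reset after digest(), so one instance can be reused
            $dig //= Digest::SHA->new('sha256');
            $dig->addfile( $f->{fn} );
            push @{ $sums{ $dig->digest() } }, $f;
        }
    }

    # Print each group of files sharing a checksum, separated by blank lines
    for my $group ( grep { @{$_} > 1 } values %sums ) {
        printf "%s\n\n", join "\n", map { $_->{fn} } @{$group};
    }

Keeping the dev/inode table per bucket rather than global is what lets the wanted sub stay cheap: hard links are only checked for files that already collide on size.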