#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files, using only modules that have been
# in the Perl core since version 5.9.3
#
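# Example usage (the paths are illustrative):
#
#     $ checkem ~/Documents ~/archive
#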

# Package name
package File::Checkem;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Load the core modules we need: Carp for error reporting, Fcntl for mode
# constants, File::Find for directory traversal, and Digest::SHA for checksums
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest::SHA;

# Lowest version number that has all of those core modules; Digest::SHA is the
# newest
use 5.009003;

# Version number to make Perl::Critic happy
our $VERSION = 2.4;

# Bail out if we weren't given at least one directory to search
croak 'Need at least one dir to search' if !@ARGV;

# Convenience keys into stat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# Figure out the SHA algorithm to use; defaults to sha256, but can be
# overridden by setting CHECKEM_ALG in the environment to e.g. "sha1", which
# is slightly faster but no longer collision-resistant (practical collisions
# were demonstrated in early 2017)
my $alg = $ENV{CHECKEM_ALG} // 'sha256';
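# For example, an illustrative invocation selecting SHA-512 (any algorithm
# name accepted by Digest::SHA->new should work):
#
#     $ CHECKEM_ALG=sha512 checkem ~/photos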

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {
        my $fn = $File::Find::name;

        # Keep only the stat values we will actually need
        my %st;
        @st{ keys %STATS } = ( stat $fn )[ values %STATS ]
          or return;

        # Check it's a regular file; S_ISREG (from Fcntl ':mode') tests the
        # file type properly, unlike a bitwise AND against S_IFREG, which
        # would also match sockets
        return if not S_ISREG( $st{mode} );

        # Check its size is non-zero
        return if not $st{size};

        # Push the filename and the stats into this size's bucket
        return push @{ $sizes{ $st{size} } },
          {
            fn => $fn,
            st => \%st,
          };
    },
}, @ARGV;

# For any size with more than one file, skip extra hard links to the same
# inode, checksum the remaining files, and group them by digest in a sums
# table
my ( %sums, $dig );
for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
    for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{ $f->{st} }{qw(dev ino)};
        if ( $dev && $ino ) {
            next if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # This file has the same size as at least one other and isn't a hard
        # link of one we've already seen, so group it by digest; create the
        # digest object if it isn't already defined (Digest::SHA resets its
        # state after digest(), so the one object can be reused)
        ( $dig //= Digest::SHA->new($alg) )->addfile( $f->{fn} );
        push @{ $sums{ $dig->digest() } }, $f;
    }
}

# Print the groups of matched files (those where more than one file shares a
# checksum in the final table), separated by blank lines
for my $group ( grep { @{$_} > 1 } values %sums ) {
    printf "%s\n\n", join "\n", map { $_->{fn} } @{$group};
}
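
# Example output (hypothetical paths); each group of duplicates is separated
# from the next by a blank line:
#
#     /home/user/a/report.pdf
#     /home/user/b/report-copy.pdf
#
#     /srv/photos/cat.jpg
#     /srv/backup/cat.jpg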