#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder <tom@sanctum.geek.nz>
# Site: <https://sanctum.geek.nz/cgit/checkem.git>
#
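# Usage: checkem FILE|DIR [FILE|DIR ...]
#
# The CHECKEM_ALG environment variable selects the Digest algorithm to use
# (any name your Digest installation accepts, for example SHA-512);
# otherwise SHA-256, SHA-1, and MD5 are tried in that order.
#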

# Package name
package File::Duplicates::Checkem;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Tolerate very old Perls
use 5.006;

# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest;

# Version number to make Perl::Critic happy
our $VERSION = 2.12;

# Bail out if we weren't given at least one file or directory to check
if ( !@ARGV ) {
    printf {*STDERR} "%s\n", 'Need at least one file or directory';
    exit 2;
}

# Convenience keys into stat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# We need to pick and create a Digest object
my $dig;

# We were told which algorithm to use
if ( exists $ENV{CHECKEM_ALG} ) {
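    # (Digest->new() will die here if the named algorithm isn't installed,
    # which is reasonable when the user asked for it explicitly)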
    $dig = Digest->new( $ENV{CHECKEM_ALG} );
}

# Try worse and worse algorithms until we get a digest object
else {
  ALG: for my $alg (qw(SHA-256 SHA-1 MD5)) {
        last ALG if eval { $dig = Digest->new($alg); };
    }
}

# Still no digest object, give up
if ( !defined $dig ) {
    croak 'Could not create a usable Digest object';
}

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {

        # Start a hash to represent this file
        my %f = ( name => $File::Find::name, );

        # Pull in the file stat values we care about
        @f{ keys %STATS } = ( stat $f{name} )[ values %STATS ]
          or return;
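        # (a failed stat returns an empty list, so the hash-slice assignment
        # above produces no values and the "or return" skips this file)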

        # Check it's a regular file
        return if not S_ISREG( $f{mode} );

        # Check its size is non-zero
        return if not $f{size};

        # Push the file hash into its size's bucket
        return push @{ $sizes{ $f{size} } }, \%f;
    },
}, @ARGV;
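
# At this point %sizes maps each observed byte size to a list of file
# records (name/dev/ino/mode/size); a size with only one file can't contain
# duplicates, so only multi-file buckets go on to be checksummed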

# For each size with more than one file, skip hard links to inodes we've
# already seen, checksum the remaining files, and group them by digest in a
# sums table
my %sums;
SIZE: for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
  FILE: for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev && $ino ) {
            next FILE if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Files still the same size and not hard linked, group by digest
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
            push @{ $sums{ $dig->digest() } }, $f;
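            # (calling digest() resets the Digest object, so it can be
            # reused for the next file)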
            close $fh
              or carp "Failed to close file $f->{name}: $!";
        }
        else {
            carp "Failed to open file $f->{name}: $!";
        }
    }
}
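
# %sums now maps each raw digest value to the files that hashed to it; a
# digest shared by more than one file marks a group of duplicates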

# Print the groups of matched files (entries in the final table whose
# checksum is shared by more than one file); sort the groups by file size,
# and the files within each group by name
GROUP:
for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
  )
{
    printf "%s\n\n", join "\n", sort map { $_->{name} } @{$group};
}