#!/usr/bin/env perl

#
# checkem: Find groups of duplicate files with core libraries.
#
# Author: Tom Ryder
# Site:
#

# Package name
package File::Duplicates::Checkem;

# Force me to write this properly
use strict;
use warnings;
use utf8;

# Tolerate very old Perls
use 5.006;

# Import modules; Digest is the only one that wasn't in Perl 5.6 core
use Carp;
use Fcntl ':mode';
use File::Find;
use Digest;

# Version number to make Perl::Critic happy
our $VERSION = 2.12;

# If no arguments, bail out with a usage message
if ( !@ARGV ) {
    printf {*STDERR} "%s\n", 'Need at least one file or directory';
    exit 2;
}

# Convenience keys into stat() return array for clarity and to appease
# Perl::Critic
my %STATS = (
    dev  => 0,
    ino  => 1,
    mode => 2,
    size => 7,
);

# We need to pick and create a Digest object
my $dig;

# We were told which algorithm to use
if ( exists $ENV{CHECKEM_ALG} ) {
    $dig = Digest->new( $ENV{CHECKEM_ALG} );
}

# Try worse and worse algorithms until we get a digest object, stopping at
# the first one that works
else {
    ALG:
    for my $alg (qw(SHA-256 SHA-1 MD5)) {
        last ALG if eval { $dig = Digest->new($alg) };
    }
}

# Still no digest object, give up
if ( !defined $dig ) {
    croak 'Could not create a usable Digest object';
}

# Start a hash of filesizes to file names/stats...
my %sizes;

# ...and fill it up with File::Find.
find {
    no_chdir => 1,
    wanted   => sub {

        # Start a hash to represent this file
        my %f = ( name => $File::Find::name );

        # Pull in the file stat values we care about
        @f{ keys %STATS } = ( stat $f{name} )[ values %STATS ]
            or return;

        # Check it's a regular file
        return if not S_ISREG( $f{mode} );

        # Check its size is non-zero
        return if not $f{size};

        # Push the file hash into its size's bucket
        return push @{ $sizes{ $f{size} } }, \%f;
    },
}, @ARGV;

# If there's more than one filename of any of the sizes, look for hard links,
# checksum them if not linked, and push them into a sums table
my %sums;
SIZE:
for my $fs ( grep { @{$_} > 1 } values %sizes ) {

    # Keep a temporary table of inodes to catch hard links
    my %inos;

    # Iterate through each file in the list
    FILE:
    for my $f ( @{$fs} ) {

        # Catch hard links on compliant systems by keeping a dev/inode hash
        my ( $dev, $ino ) = @{$f}{qw(dev ino)};
        if ( $dev && $ino ) {
            next FILE if exists $inos{$dev}{$ino};
            $inos{$dev}{$ino} = $f;
        }

        # Files still the same size and not hard linked; group them by
        # digest, reusing the one Digest object, which resets itself each
        # time its digest is read
        if ( open my $fh, '<', $f->{name} ) {
            binmode $fh;
            $dig->addfile($fh);
            push @{ $sums{ $dig->digest() } }, $f;
            close $fh
                or carp "Failed to close $f->{name}: $!";
        }
        else {
            carp "Failed to open $f->{name}: $!";
        }
    }
}

# Print the groups of matched files (more than one share a checksum in the
# final table); sort the blocks by the filesize, and the files within each
# block by name
GROUP:
for my $group (
    sort { $a->[0]{size} <=> $b->[0]{size} }
    grep { @{$_} > 1 } values %sums
    )
{
    printf "%s\n\n", join "\n",
        sort map { $_->{name} } @{$group};
}
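
# A minimal usage sketch, kept as comments so this file remains a plain
# script. The paths and duplicate groups shown are hypothetical, but the
# invocation and output shape follow from the code above: CHECKEM_ALG is the
# environment variable read near the top, arguments are files or directories
# to scan, and the GROUP loop prints one sorted group of duplicate names per
# block, blocks separated by blank lines, smallest files first.
#
#   $ CHECKEM_ALG=SHA-256 checkem ~/Pictures /mnt/backup
#   /home/user/Pictures/cat.jpg
#   /mnt/backup/Pictures/cat.jpg
#
#   /home/user/Pictures/holiday.jpg
#   /mnt/backup/old/holiday.jpg
#
# With no arguments the script prints an error to standard error and exits
# with status 2.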