Platon Technologies
neprihlásený Prihlásiť Registrácia
SlovakEnglish
open source software development oslavujeme 10 rokov vývoja otvoreného softvéru! Piatok, 29. marec 2024

Súbor: [Platon] / scripts / perl / filesystem / hardlink-files.pl (stiahnutie)

Revízia 1.7, Mon Jul 25 15:06:49 2005 UTC (18 years, 8 months ago) by rajo


Zmeny od 1.6: +75 -8 [lines]

Security fixes:
 - ignore not regular files
 - ignore files and directories writable by groups or others
 - ignore files and directories not owned by EUID
 - don't link files together, if permisions doesn't match

#!/usr/bin/perl -w

#
# hardlink-files.pl - create hardlinks from duplicate files
#
# Keywords: filesystem cleanup, create hardlinks, MD5 sum, file comparison, Perl cache
#
# Tested on Linux and FreeBSD.
#
# Developed by Lubomir Host 'rajo' <rajo AT platon.sk>
# Copyright (c) 2005 Platon SDG, http://platon.sk/
# Licensed under terms of GNU General Public License.
# All rights reserved.
#
# Changelog:
# 2005-04-17 - created
#
# Most usefull feature: create cache files with calculated MD5
#                       for speed optimalization
# 
#

# $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.6 2005/07/24 21:40:48 rajo Exp $

use strict;

use Quota;
use File::Find;
use Cwd qw( abs_path );
use File::Basename qw( basename fileparse );
use Digest::MD5;
#use Digest::SHA1;
#use Data::Dumper;
use XML::Dumper;
use Time::HiRes qw(gettimeofday tv_interval); 

$| = 1;

use vars qw (
    $start
    $cache_file
    *file *dir *prune
    $md5sums $cache $stat $stat_global $mountpoints
    @directories
    $dump
);

unless (scalar(@ARGV)) {
    print "Usage: $0 dir1 [ dir2 [ dir3 ] ]\n";
    exit 1;
}

$dump = new XML::Dumper;

$cache_file = ".hardlink-cache.xml";
$start = [gettimeofday];

*file   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

sub file_writable_by_others($$;)
{ # {{{
    my ($file, $mode) = @_;

    #print "file = '$file', mode = '$mode'";
    #use Data::Dumper;
    #print Dumper(caller, $file, $mode);

    return ($mode & 00022);
} # }}}

sub find_directories()
{ # {{{
    if (-d $file) {
        my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);

        my $path = abs_path($file);
        $path =~ s#/$##g;
        push @directories, $path;
    }
} # }}}

sub make_md5sum()
{ # {{{
    if (-f $file) {
        my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);

        #
        # security checks:
        #
        if ((-f _ ? ! -o _ : 0) || # owned by effective uid?
            -l _ || -d _ || -p _ || -b _ || -c _ || -S _ || -t _) {
            print STDERR "WARNING: File '$file' is not a regular file or not owned by EUID. Ignored for security reason.\n";
            return;
        }
        if (file_writable_by_others($file, $mode)) {
            print STDERR "WARNING: File '$file' is writable by others. Ignored for security reason.\n";
            return;
        }

        my $perm = sprintf("%lo", $mode & 07777), # perm in decimal format

        my $abs_file = abs_path($file);
        my ($filename, $path, $suffix) = fileparse($abs_file, qr{\.[^.]+$});
        $filename .= $suffix;
        return if ($filename eq $cache_file); # ignore .hardlink-cache files
        $path =~ s#/$##g;

        my $cache_exists = 0;
        if ( # {{{
            defined($cache->{$path})
                and defined($cache->{$path}->{$filename})) {
            $cache_exists = 1;
        } # }}}

        my $cache_valid = 0;
        if ($cache_exists) { # {{{
            if (        $cache->{$path}->{$filename}->{size}    == $size
                    and    $cache->{$path}->{$filename}->{ino}        == $ino
                    and    $cache->{$path}->{$filename}->{mtime}    == $mtime
                    and    $cache->{$path}->{$filename}->{perm}    == $perm
            ) {
                $cache_valid = 1;
            }
            else {
                print "File '$abs_file' modified (or permissions changed), updating cache...\n";
            }
        } # }}}

        my $md5;
        if ($cache_valid) {
            $md5 = $cache->{$path}->{$filename}->{md5};
        }
        else {
            if (open(FILE, $file)) {
                binmode(FILE);
                $md5 = Digest::MD5->new->addfile(*FILE)->hexdigest;
                close(FILE);
            }
            else {
                print STDERR "Can't open file '$file': $!";
            }
        }
        push @{$md5sums->{$dev}->{$md5}}, { # {{{
            filename    => $abs_file,
            #dir            => $dir,
            dev            => $dev,
            ino            => $ino,
            perm        => $perm,
            #nlink        => $nlink,
            #uid            => $uid,
            #gid            => $gid,
            rdev        => $rdev,
            size        => $size,
            atime        => $atime,
            mtime        => $mtime,
            ctime        => $ctime,
            blksize        => $blksize,
            blocks        => $blocks,
        }; # }}}
    }
} # }}}

find({
        wanted        => \&find_directories,
        bydepth        => 0,
        no_chdir    => 1,
    }, @ARGV);

#
# Load cache
# TODO:
my @ok_directories;
foreach my $xdir (@directories) { # {{{
    my $file = "$xdir/$cache_file";

    if (file_writable_by_others($xdir, (lstat($xdir))[2] )) {
        print STDERR "WARNING: Cache dir '$xdir' is writable by others. Ignoring directory for security reason.\n";
        next;
    }

    my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file); # create special _ filehandle, see below

    #
    # security checks:
    #
    if ((-f _ ? ! -o _ : 0) || # owned by effective uid?
        -l _ || -d _ || -p _ || -b _ || -c _ || -S _ || -t _) {
        print STDERR "WARNING: Cache file '$file' is not a regular file or not owned by EUID. Ignored for security reason.\n";
        next;
    }
    
    if (-f _) {
        if (file_writable_by_others($file, $mode)) {
            print STDERR "WARNING: Cache file '$file' is writable by others. Ignored for security reason.\n";
            next;
        }

        print "Trying to load cache file '$file' ... ";
        my $hardlink_cache;
        eval { $hardlink_cache = $dump->xml2pl($file) };
        if ($@) {
            print STDERR "FAILED: $@\n";
            unlink $file or print STDERR "Can't remove cache file '$file', please remove them manualy\n";
        }
        elsif (defined $hardlink_cache and ref($hardlink_cache) eq 'HASH') {
            $cache->{$xdir} = $hardlink_cache;
            print "OK\n"
        }
        else {
            print STDERR "FAILED: bad data structure\n";
        }
        push @ok_directories, $xdir;
    }
} # }}}

# bad directories removed
@directories = @ok_directories;

find({
        wanted        => \&make_md5sum,
        bydepth        => 0,
        no_chdir    => 1,
    }, @ARGV);

#print Dumper($md5sums);

#
# Regenerate cache
#
undef $cache;
foreach my $dev (keys %$md5sums) { # {{{
    my $md5list = $md5sums->{$dev};
    while (my ($md5, $inf) = each %$md5list) {
        foreach my $hash (@$inf) {
            my ($filename, $path, $suffix) = fileparse($hash->{filename}, qr{\.[^.]+$});
            $filename .= $suffix;
            $path =~ s#/$##g;
            $cache->{$path}->{$filename} = $hash;
            $cache->{$path}->{$filename}->{md5} = $md5;
        }
    }
} # }}}
#print "CACHE: " . Dumper($cache);

#
# Save cache
#
foreach my $xdir (@directories) { # {{{
    my $file = "$xdir/$cache_file";

    next unless (defined($cache->{$xdir}));

    print "Saving cache file '$file' ... ";
    if (open(FILE, ">$file")) {
        #print Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
        # OLD INSECURE FORMAT OF CACHE
        #print FILE Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
        # more safer new format:
        print FILE $dump->pl2xml($cache->{$xdir});
        close(FILE);
        print "OK\n";
    }
    else {
        print "failed!\n";
        print STDERR "Can't open cache file '$file': $!";
    }
} # }}}

#
# Statistics
# {{{
my $hashes;
foreach my $dev (keys %$md5sums) {
    foreach my $key (keys %{$md5sums->{$dev}}) {
        $hashes->{$key} = 1;
    }
    $stat->{$dev} = {
        count    => 0,
        size    => 0,
        blksize    => 0,
        blocks    => 0,
    };
}
$stat_global->{unique_files} = scalar(keys %$hashes);
# }}}

#
# Get mountpoints info
# {{{
Quota::setmntent();
while (my ($mountdev, $mountpoint, $type, $opts) = Quota::getmntent()) {
    my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($mountpoint);
    $mountpoints->{$dev} = {
        mountpoint    => $mountpoint,
        mountdev    => $mountdev,
    } if (defined $dev);
}
Quota::endmntent();
# }}}

#
# Do the real work: hardlink files and compute stats
#
foreach my $dev (keys %$md5sums) {
    foreach my $key (keys %{$md5sums->{$dev}}) {
        my @arr = @{$md5sums->{$dev}->{$key}};
        my $count = scalar(@arr);
        if ($count > 1) {
            my $source    = $arr[0]->{filename};
            my $inum    = $arr[0]->{ino};

            # statistics report # {{{
            $stat->{$dev}->{count}        += ($count - 1);
            $stat->{$dev}->{size}        += ($count - 1) * $arr[0]->{size};
            $stat->{$dev}->{blocks}        += ($count - 1) * $arr[0]->{blocks};
            $stat->{$dev}->{blksize}     = $arr[0]->{blksize};
            # }}}

            print $source;
            for (my $i = 1; $i < $count; $i++) { # hardlink files {{{
                print "\n\t<-- ", $arr[$i]->{filename};
                if ($arr[$i]->{ino} == $inum) {
                    print " [already linked]";
                }
                else {
                    if ($arr[0]->{perm} ne $arr[$i]->{perm}) {
                        print STDERR "\n\nWARNING: file '$arr[$i]->{filename}' not linked to '$arr[0]->{filename}' due permission difference. Security reason.";
                    }
                    else {
                        unlink $arr[$i]->{filename}            or print STDERR "\n\nWARNING: nunlink error: $!\n";
                        link $source, $arr[$i]->{filename}    or print STDERR "\n\nWARNING: link error: $!\n";
                    }
                }
            } # }}}
            print "\n\n";
        }
    }
}

#print Dumper($stat, $mountpoints);
#print Dumper($md5sums);

#
# Print statistics
# {{{
foreach my $dev (keys %$md5sums) {
    my $info = $stat->{$dev};
    print '-' x 60, "\n";
    if (defined $mountpoints->{$dev}) {
        print "$mountpoints->{$dev}->{mountpoint} [$mountpoints->{$dev}->{mountdev}]\n";
    }
    else {
        print "??? [$dev]\n";
    }
    print "         Unique files: ", scalar(keys %{$md5sums->{$dev}}), "\n";
    print "     Hardlinked files: $info->{count}\n";
    print "          Saved bytes: $info->{size}\n";
    print "         Saved blocks: $info->{blocks}\n";
    print "Saved bytes in blocks: ", $info->{blksize} * $info->{blocks}, "\n";
}
print '-' x 60, "\n";
print "TOTAL:\n";
print "   Total unique files: $stat_global->{unique_files}\n";
print "   Elapsed time: ", tv_interval($start, [gettimeofday]), " seconds\n";
# }}}

# vim: ts=4
# vim600: fdm=marker fdl=0 fdc=3


Platon Group <platon@platon.sk> http://platon.sk/
Copyright © 2002-2006 Platon Group
Stránka používa redakčný systém Metafox
Na začiatok