Platon Technologies
neprihlásený Prihlásiť Registrácia
SlovakEnglish
open source software development oslavujeme 10 rokov vývoja otvoreného softvéru! Štvrtok, 28. marec 2024

Súbor: [Platon] / scripts / perl / filesystem / hardlink-files.pl (stiahnutie)

Revízia 1.5, Sun Jul 10 22:17:13 2005 UTC (18 years, 8 months ago) by rajo

Zmeny od 1.4: +156 -21 [lines]

* Calculated MD5 sums of files are cached and used, if they are valid.
* Print elapsed time.
* Added usage information.

#!/usr/bin/perl -w

#
# hardlink-files.pl - create hardlinks from duplicate files
#
# Keywords: filesystem cleanup, create hardlinks, MD5 sum, file comparison, Perl cache
#
# Tested on Linux and FreeBSD.
#
# Developed by Lubomir Host 'rajo' <rajo AT platon.sk>
# Copyright (c) 2005 Platon SDG, http://platon.sk/
# Licensed under terms of GNU General Public License.
# All rights reserved.
#
# Changelog:
# 2005-04-17 - created
#
# Most usefull feature: create cache files with calculated MD5
#                       for speed optimalization
# 
#

# $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.4 2005/05/24 15:43:00 rajo Exp $

use strict;

use Quota;
use File::Find;
use Cwd qw( abs_path );
use File::Basename qw( basename fileparse );
use Digest::MD5;
#use Digest::SHA1;
use Data::Dumper;
use Time::HiRes qw(gettimeofday tv_interval); 

$| = 1;

use vars qw (
    $start
    $cache_file
    *file *dir *prune
    $md5sums $cache $stat $stat_global $mountpoints
    @directories
);

unless (scalar(@ARGV)) {
    print "Usage: $0 dir1 [ dir2 [ dir3 ] ]\n";
    exit 1;
}

$cache_file = ".hardlink-cache";
$start = [gettimeofday];

*file   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

sub find_directories()
{ # {{{
    if (-d $file) {
        my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);

        my $path = abs_path($file);
        $path =~ s#/$##g;
        push @directories, $path;
    }
} # }}}

sub make_md5sum()
{ # {{{
    if (-f $file) {
        my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);

        my $abs_file = abs_path($file);
        my ($filename, $path, $suffix) = fileparse($abs_file, qr{\.[^.]+$});
        $filename .= $suffix;
        return if ($filename eq $cache_file); # ignore .hardlink-cache files
        $path =~ s#/$##g;

        my $cache_exists = 0;
        if ( # {{{
            defined($cache->{$path})
                and defined($cache->{$path}->{$filename})) {
            $cache_exists = 1;
        } # }}}

        my $cache_valid = 0;
        if ($cache_exists) { # {{{
            if (        $cache->{$path}->{$filename}->{size}    == $size
                    and    $cache->{$path}->{$filename}->{ino}        == $ino
                    and    $cache->{$path}->{$filename}->{mtime}    == $mtime
            ) {
                $cache_valid = 1;
            }
            else {
                print "File '$abs_file' modified, updating cache...\n";
            }
        } # }}}

        my $md5;
        if ($cache_valid) {
            $md5 = $cache->{$path}->{$filename}->{md5};
        }
        else {
            if (open(FILE, $file)) {
                binmode(FILE);
                $md5 = Digest::MD5->new->addfile(*FILE)->hexdigest;
                close(FILE);
            }
            else {
                print STDERR "Can't open file '$file': $!";
            }
        }
        push @{$md5sums->{$dev}->{$md5}}, { # {{{
            filename    => $abs_file,
            #dir            => $dir,
            dev            => $dev,
            ino            => $ino,
            #mode        => $mode,
            #nlink        => $nlink,
            #uid            => $uid,
            #gid            => $gid,
            rdev        => $rdev,
            size        => $size,
            atime        => $atime,
            mtime        => $mtime,
            ctime        => $ctime,
            blksize        => $blksize,
            blocks        => $blocks,
        }; # }}}
    }
} # }}}

find({
        wanted        => \&find_directories,
        bydepth        => 0,
        no_chdir    => 1,
    }, @ARGV);

#
# Load cache
# TODO:
foreach my $xdir (@directories) { # {{{
    my $file = "$xdir/$cache_file";
    if (-f $file) {
        print "Trying to load cache file '$file' ... ";
        my $code;
        if (open(FILE, $file)) {
            while(<FILE>) {
                $code .= $_;
            }
            close(FILE);
            my $hardlink_cache = undef;
            eval $code;
            if ($@) {
                print "failed: $@; ... removing\n";
                unlink $file;
            }
            if (defined $hardlink_cache and ref($hardlink_cache) eq 'HASH') {
                $cache->{$xdir} = $hardlink_cache;
                print "OK\n"
            }
            else {
                print "failed: bad structure; ... removing\n";
                unlink $file;
            }
        }
        else {
            print STDERR "Can't open cache file '$file': $!";
        }
    }
} # }}}

find({
        wanted        => \&make_md5sum,
        bydepth        => 0,
        no_chdir    => 1,
    }, @ARGV);

#print Dumper($md5sums);

#
# Regenerate cache
#
undef $cache;
foreach my $dev (keys %$md5sums) { # {{{
    my $md5list = $md5sums->{$dev};
    while (my ($md5, $inf) = each %$md5list) {
        foreach my $hash (@$inf) {
            my ($filename, $path, $suffix) = fileparse($hash->{filename}, qr{\.[^.]+$});
            $filename .= $suffix;
            $path =~ s#/$##g;
            $cache->{$path}->{$filename} = $hash;
            $cache->{$path}->{$filename}->{md5} = $md5;
        }
    }
} # }}}
#print "CACHE: " . Dumper($cache);

#
# Save cache
#
foreach my $xdir (@directories) { # {{{
    my $file = "$xdir/$cache_file";

    next unless (defined($cache->{$xdir}));

    print "Saving cache file '$file' ... ";
    if (open(FILE, ">$file")) {
        #print Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
        print FILE Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
        close(FILE);
        print "OK\n";
    }
    else {
        print "failed!\n";
        print STDERR "Can't open cache file '$file': $!";
    }
} # }}}

#
# Statistics
# {{{
my $hashes;
foreach my $dev (keys %$md5sums) {
    foreach my $key (keys %{$md5sums->{$dev}}) {
        $hashes->{$key} = 1;
    }
    $stat->{$dev} = {
        count    => 0,
        size    => 0,
        blksize    => 0,
        blocks    => 0,
    };
}
$stat_global->{unique_files} = scalar(keys %$hashes);
# }}}

#
# Get mountpoints info
# {{{
Quota::setmntent();
while (my ($mountdev, $mountpoint, $type, $opts) = Quota::getmntent()) {
    my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($mountpoint);
    $mountpoints->{$dev} = {
        mountpoint    => $mountpoint,
        mountdev    => $mountdev,
    } if (defined $dev);
}
Quota::endmntent();
# }}}

#
# Do the real work: hardlink files and compute stats
#
foreach my $dev (keys %$md5sums) {
    foreach my $key (keys %{$md5sums->{$dev}}) {
        my @arr = @{$md5sums->{$dev}->{$key}};
        my $count = scalar(@arr);
        if ($count > 1) {
            my $source    = $arr[0]->{filename};
            my $inum    = $arr[0]->{ino};

            # statistics report # {{{
            $stat->{$dev}->{count}        += ($count - 1);
            $stat->{$dev}->{size}        += ($count - 1) * $arr[0]->{size};
            $stat->{$dev}->{blocks}        += ($count - 1) * $arr[0]->{blocks};
            $stat->{$dev}->{blksize}     = $arr[0]->{blksize};
            # }}}

            print $source;
            for (my $i = 1; $i < $count; $i++) { # hardlink files {{{
                print "\n\t<-- ", $arr[$i]->{filename};
                if ($arr[$i]->{ino} == $inum) {
                    print " [already linked]";
                }
                else {
                    unlink $arr[$i]->{filename} or warn "\n\nunlink error: $!\n";
                    link $source, $arr[$i]->{filename} or warn "\n\nlink error: $!\n";
                }
            } # }}}
            print "\n\n";
        }
    }
}

#print Dumper($stat, $mountpoints);
#print Dumper($md5sums);

#
# Print statistics
# {{{
foreach my $dev (keys %$md5sums) {
    my $info = $stat->{$dev};
    print '-' x 60, "\n";
    if (defined $mountpoints->{$dev}) {
        print "$mountpoints->{$dev}->{mountpoint} [$mountpoints->{$dev}->{mountdev}]\n";
    }
    else {
        print "??? [$dev]\n";
    }
    print "         Unique files: ", scalar(keys %{$md5sums->{$dev}}), "\n";
    print "     Hardlinked files: $info->{count}\n";
    print "          Saved bytes: $info->{size}\n";
    print "         Saved blocks: $info->{blocks}\n";
    print "Saved bytes in blocks: ", $info->{blksize} * $info->{blocks}, "\n";
}
print '-' x 60, "\n";
print "TOTAL:\n";
print "   Total unique files: $stat_global->{unique_files}\n";
print "   Elapsed time: ", tv_interval($start, [gettimeofday]), " seconds\n";
# }}}

# vim: ts=4
# vim600: fdm=marker fdl=0 fdc=3


Platon Group <platon@platon.sk> http://platon.sk/
Copyright © 2002-2006 Platon Group
Stránka používa redakčný systém Metafox
Na začiatok