Súbor: [Platon] / scripts / perl / filesystem / hardlink-files.pl (stiahnutie)
Revízia 1.5, Sun Jul 10 22:17:13 2005 UTC (18 years, 8 months ago) by rajo
Zmeny od 1.4: +156 -21
[lines]
* Calculated MD5 sums of files are cached and used, if they are valid.
* Print elapsed time.
* Added usage information.
|
#!/usr/bin/perl -w
#
# hardlink-files.pl - create hardlinks from duplicate files
#
# Keywords: filesystem cleanup, create hardlinks, MD5 sum, file comparison, Perl cache
#
# Tested on Linux and FreeBSD.
#
# Developed by Lubomir Host 'rajo' <rajo AT platon.sk>
# Copyright (c) 2005 Platon SDG, http://platon.sk/
# Licensed under terms of GNU General Public License.
# All rights reserved.
#
# Changelog:
# 2005-04-17 - created
#
# Most usefull feature: create cache files with calculated MD5
# for speed optimalization
#
#
# $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.4 2005/05/24 15:43:00 rajo Exp $
use strict;
use Quota;
use File::Find;
use Cwd qw( abs_path );
use File::Basename qw( basename fileparse );
use Digest::MD5;
#use Digest::SHA1;
use Data::Dumper;
use Time::HiRes qw(gettimeofday tv_interval);
$| = 1;
use vars qw (
$start
$cache_file
*file *dir *prune
$md5sums $cache $stat $stat_global $mountpoints
@directories
);
unless (scalar(@ARGV)) {
print "Usage: $0 dir1 [ dir2 [ dir3 ] ]\n";
exit 1;
}
$cache_file = ".hardlink-cache";
$start = [gettimeofday];
*file = *File::Find::name;
*dir = *File::Find::dir;
*prune = *File::Find::prune;
sub find_directories()
{ # {{{
if (-d $file) {
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);
my $path = abs_path($file);
$path =~ s#/$##g;
push @directories, $path;
}
} # }}}
sub make_md5sum()
{ # {{{
if (-f $file) {
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);
my $abs_file = abs_path($file);
my ($filename, $path, $suffix) = fileparse($abs_file, qr{\.[^.]+$});
$filename .= $suffix;
return if ($filename eq $cache_file); # ignore .hardlink-cache files
$path =~ s#/$##g;
my $cache_exists = 0;
if ( # {{{
defined($cache->{$path})
and defined($cache->{$path}->{$filename})) {
$cache_exists = 1;
} # }}}
my $cache_valid = 0;
if ($cache_exists) { # {{{
if ( $cache->{$path}->{$filename}->{size} == $size
and $cache->{$path}->{$filename}->{ino} == $ino
and $cache->{$path}->{$filename}->{mtime} == $mtime
) {
$cache_valid = 1;
}
else {
print "File '$abs_file' modified, updating cache...\n";
}
} # }}}
my $md5;
if ($cache_valid) {
$md5 = $cache->{$path}->{$filename}->{md5};
}
else {
if (open(FILE, $file)) {
binmode(FILE);
$md5 = Digest::MD5->new->addfile(*FILE)->hexdigest;
close(FILE);
}
else {
print STDERR "Can't open file '$file': $!";
}
}
push @{$md5sums->{$dev}->{$md5}}, { # {{{
filename => $abs_file,
#dir => $dir,
dev => $dev,
ino => $ino,
#mode => $mode,
#nlink => $nlink,
#uid => $uid,
#gid => $gid,
rdev => $rdev,
size => $size,
atime => $atime,
mtime => $mtime,
ctime => $ctime,
blksize => $blksize,
blocks => $blocks,
}; # }}}
}
} # }}}
find({
wanted => \&find_directories,
bydepth => 0,
no_chdir => 1,
}, @ARGV);
#
# Load cache
# TODO:
foreach my $xdir (@directories) { # {{{
my $file = "$xdir/$cache_file";
if (-f $file) {
print "Trying to load cache file '$file' ... ";
my $code;
if (open(FILE, $file)) {
while(<FILE>) {
$code .= $_;
}
close(FILE);
my $hardlink_cache = undef;
eval $code;
if ($@) {
print "failed: $@; ... removing\n";
unlink $file;
}
if (defined $hardlink_cache and ref($hardlink_cache) eq 'HASH') {
$cache->{$xdir} = $hardlink_cache;
print "OK\n"
}
else {
print "failed: bad structure; ... removing\n";
unlink $file;
}
}
else {
print STDERR "Can't open cache file '$file': $!";
}
}
} # }}}
find({
wanted => \&make_md5sum,
bydepth => 0,
no_chdir => 1,
}, @ARGV);
#print Dumper($md5sums);
#
# Regenerate cache
#
undef $cache;
foreach my $dev (keys %$md5sums) { # {{{
my $md5list = $md5sums->{$dev};
while (my ($md5, $inf) = each %$md5list) {
foreach my $hash (@$inf) {
my ($filename, $path, $suffix) = fileparse($hash->{filename}, qr{\.[^.]+$});
$filename .= $suffix;
$path =~ s#/$##g;
$cache->{$path}->{$filename} = $hash;
$cache->{$path}->{$filename}->{md5} = $md5;
}
}
} # }}}
#print "CACHE: " . Dumper($cache);
#
# Save cache
#
foreach my $xdir (@directories) { # {{{
my $file = "$xdir/$cache_file";
next unless (defined($cache->{$xdir}));
print "Saving cache file '$file' ... ";
if (open(FILE, ">$file")) {
#print Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
print FILE Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
close(FILE);
print "OK\n";
}
else {
print "failed!\n";
print STDERR "Can't open cache file '$file': $!";
}
} # }}}
#
# Statistics
# {{{
my $hashes;
foreach my $dev (keys %$md5sums) {
foreach my $key (keys %{$md5sums->{$dev}}) {
$hashes->{$key} = 1;
}
$stat->{$dev} = {
count => 0,
size => 0,
blksize => 0,
blocks => 0,
};
}
$stat_global->{unique_files} = scalar(keys %$hashes);
# }}}
#
# Get mountpoints info
# {{{
Quota::setmntent();
while (my ($mountdev, $mountpoint, $type, $opts) = Quota::getmntent()) {
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($mountpoint);
$mountpoints->{$dev} = {
mountpoint => $mountpoint,
mountdev => $mountdev,
} if (defined $dev);
}
Quota::endmntent();
# }}}
#
# Do the real work: hardlink files and compute stats
#
foreach my $dev (keys %$md5sums) {
foreach my $key (keys %{$md5sums->{$dev}}) {
my @arr = @{$md5sums->{$dev}->{$key}};
my $count = scalar(@arr);
if ($count > 1) {
my $source = $arr[0]->{filename};
my $inum = $arr[0]->{ino};
# statistics report # {{{
$stat->{$dev}->{count} += ($count - 1);
$stat->{$dev}->{size} += ($count - 1) * $arr[0]->{size};
$stat->{$dev}->{blocks} += ($count - 1) * $arr[0]->{blocks};
$stat->{$dev}->{blksize} = $arr[0]->{blksize};
# }}}
print $source;
for (my $i = 1; $i < $count; $i++) { # hardlink files {{{
print "\n\t<-- ", $arr[$i]->{filename};
if ($arr[$i]->{ino} == $inum) {
print " [already linked]";
}
else {
unlink $arr[$i]->{filename} or warn "\n\nunlink error: $!\n";
link $source, $arr[$i]->{filename} or warn "\n\nlink error: $!\n";
}
} # }}}
print "\n\n";
}
}
}
#print Dumper($stat, $mountpoints);
#print Dumper($md5sums);
#
# Print statistics
# {{{
foreach my $dev (keys %$md5sums) {
my $info = $stat->{$dev};
print '-' x 60, "\n";
if (defined $mountpoints->{$dev}) {
print "$mountpoints->{$dev}->{mountpoint} [$mountpoints->{$dev}->{mountdev}]\n";
}
else {
print "??? [$dev]\n";
}
print " Unique files: ", scalar(keys %{$md5sums->{$dev}}), "\n";
print " Hardlinked files: $info->{count}\n";
print " Saved bytes: $info->{size}\n";
print " Saved blocks: $info->{blocks}\n";
print "Saved bytes in blocks: ", $info->{blksize} * $info->{blocks}, "\n";
}
print '-' x 60, "\n";
print "TOTAL:\n";
print " Total unique files: $stat_global->{unique_files}\n";
print " Elapsed time: ", tv_interval($start, [gettimeofday]), " seconds\n";
# }}}
# vim: ts=4
# vim600: fdm=marker fdl=0 fdc=3
Platon Group <platon@platon.sk> http://platon.sk/
|