verzia 1.4, 2005/05/24 15:43:00 |
verzia 1.5, 2005/07/10 22:17:13 |
|
|
# |
# |
# hardlink-files.pl - create hardlinks from duplicate files |
# hardlink-files.pl - create hardlinks from duplicate files |
# |
# |
|
# Keywords: filesystem cleanup, create hardlinks, MD5 sum, file comparison, Perl cache |
|
# |
# Tested on Linux and FreeBSD. |
# Tested on Linux and FreeBSD. |
# |
# |
# Developed by Lubomir Host 'rajo' <rajo AT platon.sk> |
# Developed by Lubomir Host 'rajo' <rajo AT platon.sk> |
|
|
# Changelog: |
# Changelog: |
# 2005-04-17 - created |
# 2005-04-17 - created |
# |
# |
|
# Most useful feature: create cache files with calculated MD5 sums
# for speed optimization
|
# |
|
# |
|
|
# $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.3 2005/05/24 12:31:28 rajo Exp $ |
# $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.4 2005/05/24 15:43:00 rajo Exp $ |
|
|
use strict; |
use strict; |
|
|
use Quota; |
use Quota; |
use File::Find; |
use File::Find; |
use Cwd qw( abs_path ); |
use Cwd qw( abs_path ); |
|
use File::Basename qw( basename fileparse ); |
use Digest::MD5; |
use Digest::MD5; |
#use Digest::SHA1; |
#use Digest::SHA1; |
use Data::Dumper; |
use Data::Dumper; |
|
use Time::HiRes qw(gettimeofday tv_interval); |
|
|
$| = 1; |
$| = 1; |
|
|
use vars qw ( |
use vars qw ( |
|
$start |
|
$cache_file |
*file *dir *prune |
*file *dir *prune |
$md5sums $cache $stat $stat_global $mountpoints |
$md5sums $cache $stat $stat_global $mountpoints |
|
@directories |
); |
); |
|
|
|
# Refuse to run without at least one directory argument.
if (!@ARGV) {
    print "Usage: $0 dir1 [ dir2 [ dir3 ] ]\n";
    exit 1;
}

# Wall-clock start (used for the elapsed-time report at the end) and the
# name of the per-directory cache file.
$start      = [gettimeofday];
$cache_file = ".hardlink-cache";

# Short aliases for the File::Find package variables used by the callbacks.
*prune = *File::Find::prune;
*dir   = *File::Find::dir;
*file  = *File::Find::name;
|
|
# find_directories - File::Find "wanted" callback that collects directories.
#
# Takes no arguments; reads the current entry from $file (alias of
# $File::Find::name).  The absolute path of every directory, with a trailing
# slash removed, is appended to the global @directories.  Returns nothing.
sub find_directories
{ # {{{
    return unless -d $file;

    # abs_path() may yield undef when the entry cannot be resolved (for
    # example if it vanished during the scan); skip it rather than record
    # an undefined directory name.
    my $path = abs_path($file);
    return unless defined $path;

    $path =~ s#/$##;
    push @directories, $path;
    return;
} # }}}
|
|
|
# make_md5sum - File::Find "wanted" callback that fingerprints regular files.
#
# Takes no arguments; reads the current entry from $file (alias of
# $File::Find::name).  For every regular file other than the cache file
# itself it obtains an MD5 hex digest - from $cache when the cached size,
# inode and mtime still match, otherwise by reading the file - and appends a
# record with the file's lstat() data to @{ $md5sums->{$dev}->{$md5} }.
# Unreadable files are reported on STDERR and skipped.  Returns nothing.
sub make_md5sum
{ # {{{
    # lstat() describes the entry itself, and "-f _" reuses that stat buffer,
    # so symlinks are rejected together with directories and special files.
    # Testing with "-f $file" would follow the link while lstat() described
    # the link, mixing the target's path with the link's inode and size.
    # Slice indices: dev, ino, rdev, size, atime, mtime, ctime, blksize, blocks
    # (mode, nlink, uid and gid are not recorded).
    my ($dev, $ino, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks)
        = (lstat($file))[0, 1, 6 .. 12];
    return unless defined $dev and -f _;

    # abs_path() may yield undef if the file vanished during the scan.
    my $abs_file = abs_path($file);
    return unless defined $abs_file;

    my ($filename, $path) = fileparse($abs_file);
    return if ($filename eq $cache_file); # ignore .hardlink-cache files
    $path =~ s#/$##;

    # Reuse the cached digest only when the cache entry is well formed and
    # size, inode and mtime are all unchanged.
    my $md5;
    my $cached = defined($cache->{$path}) ? $cache->{$path}->{$filename} : undef;
    if (ref($cached) eq 'HASH') {
        if (    defined $cached->{md5}   and length $cached->{md5}
            and defined $cached->{size}  and $cached->{size}  == $size
            and defined $cached->{ino}   and $cached->{ino}   == $ino
            and defined $cached->{mtime} and $cached->{mtime} == $mtime) {
            $md5 = $cached->{md5};
        }
        else {
            print "File '$abs_file' modified, updating cache...\n";
        }
    }

    unless (defined $md5) {
        # Three-argument open with a lexical handle: file names come straight
        # from the scanned tree, and two-argument open would interpret a name
        # such as "x|" or ">x" as a pipe or an output redirection.
        my $fh;
        unless (open($fh, '<', $file)) {
            # Without a digest the file cannot be compared.  Skip it instead
            # of filing every unreadable file under one undefined key, which
            # would make them all look like duplicates of each other.
            print STDERR "Can't open file '$file': $!\n";
            return;
        }
        binmode($fh);
        $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);
    }

    push @{$md5sums->{$dev}->{$md5}}, { # {{{
        filename => $abs_file,
        dev      => $dev,
        ino      => $ino,
        rdev     => $rdev,
        size     => $size,
        atime    => $atime,
        mtime    => $mtime, # required by the cache validity check above
        ctime    => $ctime,
        blksize  => $blksize,
        blocks   => $blocks,
    }; # }}}
    return;
} # }}}
|
|
|
# First pass over the command-line arguments: find_directories is invoked for
# every entry, without changing the working directory.
my %dir_scan = (
    no_chdir => 1,
    bydepth  => 0,
    wanted   => \&find_directories,
);
find(\%dir_scan, @ARGV);
|
|
# |
# |
# Load cache |
# Load cache |
# TODO: |
# TODO: |
# For each collected directory, load its cache file (if present) into
# $cache->{$xdir}.  A cache file that fails to evaluate, or that does not
# produce a hash reference in $hardlink_cache, is removed.
foreach my $xdir (@directories) { # {{{
    my $file = "$xdir/$cache_file";
    next unless -f $file;

    print "Trying to load cache file '$file' ... ";

    my $fh;
    unless (open($fh, '<', $file)) {
        print "failed!\n";
        print STDERR "Can't open cache file '$file': $!\n";
        next;
    }
    my $code = do { local $/; <$fh> }; # slurp the whole file
    close($fh);

    # NOTE(review): the cache is Perl source (written with Data::Dumper) and
    # is executed with string eval.  Anyone able to write a cache file into a
    # scanned directory can run arbitrary code as the invoking user; consider
    # a data-only format before using this on untrusted trees.
    my $hardlink_cache = undef; # assigned by the evaluated cache source
    eval $code;

    # Exactly one outcome per file: an evaluation error must not fall
    # through to the structure check (which would report and unlink twice,
    # or accept a partially assigned hash).
    if ($@) {
        print "failed: $@; ... removing\n";
        unlink $file;
    }
    elsif (defined $hardlink_cache and ref($hardlink_cache) eq 'HASH') {
        $cache->{$xdir} = $hardlink_cache;
        print "OK\n";
    }
    else {
        print "failed: bad structure; ... removing\n";
        unlink $file;
    }
} # }}}
|
|
# Second pass over the command-line arguments: make_md5sum is invoked for
# every entry, without changing the working directory.
find({
    wanted   => \&make_md5sum,
    bydepth  => 0,
    no_chdir => 1,
}, @ARGV);
|
|
#print Dumper($md5sums); |
#print Dumper($md5sums); |
|
|
# |
# |
|
# Regenerate cache |
|
# |
|
# Rebuild $cache from scratch out of the records collected in $md5sums:
# $cache->{directory}->{file name} is the file's record, extended with its
# digest under the key "md5".
undef $cache;
foreach my $dev (keys %$md5sums) { # {{{
    my $md5list = $md5sums->{$dev};
    foreach my $md5 (keys %$md5list) {
        # Records filed under an empty digest belong to files that could not
        # be read; caching them would make the next run trust a digest that
        # was never computed.
        next unless length $md5;

        foreach my $hash (@{ $md5list->{$md5} }) {
            my ($filename, $path) = fileparse($hash->{filename});
            $path =~ s#/$##;
            $hash->{md5} = $md5;
            $cache->{$path}->{$filename} = $hash;
        }
    }
} # }}}
#print "CACHE: " . Dumper($cache);
|
#print "CACHE: " . Dumper($cache); |
|
|
|
# |
|
# Save cache |
|
# |
|
# Write $cache->{$xdir} into the cache file of every collected directory that
# has cache data, as Perl source assigning to $hardlink_cache.
foreach my $xdir (@directories) { # {{{
    next unless (defined($cache->{$xdir}));

    my $file = "$xdir/$cache_file";
    print "Saving cache file '$file' ... ";

    # Three-argument open with a lexical handle, so that unusual characters
    # in the directory name cannot change the open mode.
    my $fh;
    unless (open($fh, '>', $file)) {
        print "failed!\n";
        print STDERR "Can't open cache file '$file': $!\n";
        next;
    }

    #print Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
    my $error;
    print {$fh} Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache'])
        or $error = "$!";
    # Buffered write errors (e.g. a full disk) only surface at close().
    unless (close($fh)) {
        $error = "$!" unless defined $error;
    }

    if (defined $error) {
        print "failed!\n";
        print STDERR "Can't write cache file '$file': $error\n";
    }
    else {
        print "OK\n";
    }
} # }}}
|
|
|
# |
# Statistics |
# Statistics |
# {{{ |
# {{{ |
my $hashes; |
my $hashes; |
Riadok 174 foreach my $dev (keys %$md5sums) { |
|
Riadok 308 foreach my $dev (keys %$md5sums) { |
|
print '-' x 60, "\n"; |
print '-' x 60, "\n"; |
print "TOTAL:\n"; |
print "TOTAL:\n"; |
print " Total unique files: $stat_global->{unique_files}\n"; |
print " Total unique files: $stat_global->{unique_files}\n"; |
|
print " Elapsed time: ", tv_interval($start, [gettimeofday]), " seconds\n"; |
# }}} |
# }}} |
|
|
# vim: ts=4 |
# vim: ts=4 |