Súbor: [Platon] / scripts / perl / filesystem / hardlink-files.pl (stiahnutie)
Revízia 1.7, Mon Jul 25 15:06:49 2005 UTC (18 years, 8 months ago) by rajo
Zmeny od 1.6: +75 -8
[lines]
Security fixes:
- ignore not regular files
- ignore files and directories writable by groups or others
- ignore files and directories not owned by EUID
- don't link files together, if permisions doesn't match
|
#!/usr/bin/perl -w
#
# hardlink-files.pl - create hardlinks from duplicate files
#
# Keywords: filesystem cleanup, create hardlinks, MD5 sum, file comparison, Perl cache
#
# Tested on Linux and FreeBSD.
#
# Developed by Lubomir Host 'rajo' <rajo AT platon.sk>
# Copyright (c) 2005 Platon SDG, http://platon.sk/
# Licensed under terms of GNU General Public License.
# All rights reserved.
#
# Changelog:
# 2005-04-17 - created
#
# Most usefull feature: create cache files with calculated MD5
# for speed optimalization
#
#
# $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.6 2005/07/24 21:40:48 rajo Exp $
use strict;
use Quota;
use File::Find;
use Cwd qw( abs_path );
use File::Basename qw( basename fileparse );
use Digest::MD5;
#use Digest::SHA1;
#use Data::Dumper;
use XML::Dumper;
use Time::HiRes qw(gettimeofday tv_interval);
$| = 1;
use vars qw (
$start
$cache_file
*file *dir *prune
$md5sums $cache $stat $stat_global $mountpoints
@directories
$dump
);
unless (scalar(@ARGV)) {
print "Usage: $0 dir1 [ dir2 [ dir3 ] ]\n";
exit 1;
}
$dump = new XML::Dumper;
$cache_file = ".hardlink-cache.xml";
$start = [gettimeofday];
*file = *File::Find::name;
*dir = *File::Find::dir;
*prune = *File::Find::prune;
sub file_writable_by_others($$;)
{ # {{{
my ($file, $mode) = @_;
#print "file = '$file', mode = '$mode'";
#use Data::Dumper;
#print Dumper(caller, $file, $mode);
return ($mode & 00022);
} # }}}
sub find_directories()
{ # {{{
if (-d $file) {
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);
my $path = abs_path($file);
$path =~ s#/$##g;
push @directories, $path;
}
} # }}}
sub make_md5sum()
{ # {{{
if (-f $file) {
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file);
#
# security checks:
#
if ((-f _ ? ! -o _ : 0) || # owned by effective uid?
-l _ || -d _ || -p _ || -b _ || -c _ || -S _ || -t _) {
print STDERR "WARNING: File '$file' is not a regular file or not owned by EUID. Ignored for security reason.\n";
return;
}
if (file_writable_by_others($file, $mode)) {
print STDERR "WARNING: File '$file' is writable by others. Ignored for security reason.\n";
return;
}
my $perm = sprintf("%lo", $mode & 07777), # perm in decimal format
my $abs_file = abs_path($file);
my ($filename, $path, $suffix) = fileparse($abs_file, qr{\.[^.]+$});
$filename .= $suffix;
return if ($filename eq $cache_file); # ignore .hardlink-cache files
$path =~ s#/$##g;
my $cache_exists = 0;
if ( # {{{
defined($cache->{$path})
and defined($cache->{$path}->{$filename})) {
$cache_exists = 1;
} # }}}
my $cache_valid = 0;
if ($cache_exists) { # {{{
if ( $cache->{$path}->{$filename}->{size} == $size
and $cache->{$path}->{$filename}->{ino} == $ino
and $cache->{$path}->{$filename}->{mtime} == $mtime
and $cache->{$path}->{$filename}->{perm} == $perm
) {
$cache_valid = 1;
}
else {
print "File '$abs_file' modified (or permissions changed), updating cache...\n";
}
} # }}}
my $md5;
if ($cache_valid) {
$md5 = $cache->{$path}->{$filename}->{md5};
}
else {
if (open(FILE, $file)) {
binmode(FILE);
$md5 = Digest::MD5->new->addfile(*FILE)->hexdigest;
close(FILE);
}
else {
print STDERR "Can't open file '$file': $!";
}
}
push @{$md5sums->{$dev}->{$md5}}, { # {{{
filename => $abs_file,
#dir => $dir,
dev => $dev,
ino => $ino,
perm => $perm,
#nlink => $nlink,
#uid => $uid,
#gid => $gid,
rdev => $rdev,
size => $size,
atime => $atime,
mtime => $mtime,
ctime => $ctime,
blksize => $blksize,
blocks => $blocks,
}; # }}}
}
} # }}}
find({
wanted => \&find_directories,
bydepth => 0,
no_chdir => 1,
}, @ARGV);
#
# Load cache
# TODO:
my @ok_directories;
foreach my $xdir (@directories) { # {{{
my $file = "$xdir/$cache_file";
if (file_writable_by_others($xdir, (lstat($xdir))[2] )) {
print STDERR "WARNING: Cache dir '$xdir' is writable by others. Ignoring directory for security reason.\n";
next;
}
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file); # create special _ filehandle, see below
#
# security checks:
#
if ((-f _ ? ! -o _ : 0) || # owned by effective uid?
-l _ || -d _ || -p _ || -b _ || -c _ || -S _ || -t _) {
print STDERR "WARNING: Cache file '$file' is not a regular file or not owned by EUID. Ignored for security reason.\n";
next;
}
if (-f _) {
if (file_writable_by_others($file, $mode)) {
print STDERR "WARNING: Cache file '$file' is writable by others. Ignored for security reason.\n";
next;
}
print "Trying to load cache file '$file' ... ";
my $hardlink_cache;
eval { $hardlink_cache = $dump->xml2pl($file) };
if ($@) {
print STDERR "FAILED: $@\n";
unlink $file or print STDERR "Can't remove cache file '$file', please remove them manualy\n";
}
elsif (defined $hardlink_cache and ref($hardlink_cache) eq 'HASH') {
$cache->{$xdir} = $hardlink_cache;
print "OK\n"
}
else {
print STDERR "FAILED: bad data structure\n";
}
push @ok_directories, $xdir;
}
} # }}}
# bad directories removed
@directories = @ok_directories;
find({
wanted => \&make_md5sum,
bydepth => 0,
no_chdir => 1,
}, @ARGV);
#print Dumper($md5sums);
#
# Regenerate cache
#
undef $cache;
foreach my $dev (keys %$md5sums) { # {{{
my $md5list = $md5sums->{$dev};
while (my ($md5, $inf) = each %$md5list) {
foreach my $hash (@$inf) {
my ($filename, $path, $suffix) = fileparse($hash->{filename}, qr{\.[^.]+$});
$filename .= $suffix;
$path =~ s#/$##g;
$cache->{$path}->{$filename} = $hash;
$cache->{$path}->{$filename}->{md5} = $md5;
}
}
} # }}}
#print "CACHE: " . Dumper($cache);
#
# Save cache
#
foreach my $xdir (@directories) { # {{{
my $file = "$xdir/$cache_file";
next unless (defined($cache->{$xdir}));
print "Saving cache file '$file' ... ";
if (open(FILE, ">$file")) {
#print Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
# OLD INSECURE FORMAT OF CACHE
#print FILE Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']);
# more safer new format:
print FILE $dump->pl2xml($cache->{$xdir});
close(FILE);
print "OK\n";
}
else {
print "failed!\n";
print STDERR "Can't open cache file '$file': $!";
}
} # }}}
#
# Statistics
# {{{
my $hashes;
foreach my $dev (keys %$md5sums) {
foreach my $key (keys %{$md5sums->{$dev}}) {
$hashes->{$key} = 1;
}
$stat->{$dev} = {
count => 0,
size => 0,
blksize => 0,
blocks => 0,
};
}
$stat_global->{unique_files} = scalar(keys %$hashes);
# }}}
#
# Get mountpoints info
# {{{
Quota::setmntent();
while (my ($mountdev, $mountpoint, $type, $opts) = Quota::getmntent()) {
my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($mountpoint);
$mountpoints->{$dev} = {
mountpoint => $mountpoint,
mountdev => $mountdev,
} if (defined $dev);
}
Quota::endmntent();
# }}}
#
# Do the real work: hardlink files and compute stats
#
foreach my $dev (keys %$md5sums) {
foreach my $key (keys %{$md5sums->{$dev}}) {
my @arr = @{$md5sums->{$dev}->{$key}};
my $count = scalar(@arr);
if ($count > 1) {
my $source = $arr[0]->{filename};
my $inum = $arr[0]->{ino};
# statistics report # {{{
$stat->{$dev}->{count} += ($count - 1);
$stat->{$dev}->{size} += ($count - 1) * $arr[0]->{size};
$stat->{$dev}->{blocks} += ($count - 1) * $arr[0]->{blocks};
$stat->{$dev}->{blksize} = $arr[0]->{blksize};
# }}}
print $source;
for (my $i = 1; $i < $count; $i++) { # hardlink files {{{
print "\n\t<-- ", $arr[$i]->{filename};
if ($arr[$i]->{ino} == $inum) {
print " [already linked]";
}
else {
if ($arr[0]->{perm} ne $arr[$i]->{perm}) {
print STDERR "\n\nWARNING: file '$arr[$i]->{filename}' not linked to '$arr[0]->{filename}' due permission difference. Security reason.";
}
else {
unlink $arr[$i]->{filename} or print STDERR "\n\nWARNING: nunlink error: $!\n";
link $source, $arr[$i]->{filename} or print STDERR "\n\nWARNING: link error: $!\n";
}
}
} # }}}
print "\n\n";
}
}
}
#print Dumper($stat, $mountpoints);
#print Dumper($md5sums);
#
# Print statistics
# {{{
foreach my $dev (keys %$md5sums) {
my $info = $stat->{$dev};
print '-' x 60, "\n";
if (defined $mountpoints->{$dev}) {
print "$mountpoints->{$dev}->{mountpoint} [$mountpoints->{$dev}->{mountdev}]\n";
}
else {
print "??? [$dev]\n";
}
print " Unique files: ", scalar(keys %{$md5sums->{$dev}}), "\n";
print " Hardlinked files: $info->{count}\n";
print " Saved bytes: $info->{size}\n";
print " Saved blocks: $info->{blocks}\n";
print "Saved bytes in blocks: ", $info->{blksize} * $info->{blocks}, "\n";
}
print '-' x 60, "\n";
print "TOTAL:\n";
print " Total unique files: $stat_global->{unique_files}\n";
print " Elapsed time: ", tv_interval($start, [gettimeofday]), " seconds\n";
# }}}
# vim: ts=4
# vim600: fdm=marker fdl=0 fdc=3
Platon Group <platon@platon.sk> http://platon.sk/
|