#!/usr/bin/perl -w # # hardlink-files.pl - create hardlinks from duplicate files # # Keywords: filesystem cleanup, create hardlinks, MD5 sum, file comparison, Perl cache # # Tested on Linux and FreeBSD. # # Developed by Lubomir Host 'rajo' # Copyright (c) 2005 Platon SDG, http://platon.sk/ # Licensed under terms of GNU General Public License. # All rights reserved. # # Changelog: # 2005-04-17 - created # # Most usefull feature: create cache files with calculated MD5 # for speed optimalization # # # $Platon: scripts/perl/filesystem/hardlink-files.pl,v 1.5 2005/07/10 22:17:13 rajo Exp $ use strict; use Quota; use File::Find; use Cwd qw( abs_path ); use File::Basename qw( basename fileparse ); use Digest::MD5; #use Digest::SHA1; #use Data::Dumper; use XML::Dumper; use Time::HiRes qw(gettimeofday tv_interval); $| = 1; use vars qw ( $start $cache_file *file *dir *prune $md5sums $cache $stat $stat_global $mountpoints @directories $dump ); unless (scalar(@ARGV)) { print "Usage: $0 dir1 [ dir2 [ dir3 ] ]\n"; exit 1; } $dump = new XML::Dumper; $cache_file = ".hardlink-cache.xml"; $start = [gettimeofday]; *file = *File::Find::name; *dir = *File::Find::dir; *prune = *File::Find::prune; sub find_directories() { # {{{ if (-d $file) { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file); my $path = abs_path($file); $path =~ s#/$##g; push @directories, $path; } } # }}} sub make_md5sum() { # {{{ if (-f $file) { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($file); my $abs_file = abs_path($file); my ($filename, $path, $suffix) = fileparse($abs_file, qr{\.[^.]+$}); $filename .= $suffix; return if ($filename eq $cache_file); # ignore .hardlink-cache files $path =~ s#/$##g; my $cache_exists = 0; if ( # {{{ defined($cache->{$path}) and defined($cache->{$path}->{$filename})) { $cache_exists = 1; } # }}} my $cache_valid = 0; if ($cache_exists) { # {{{ if ( $cache->{$path}->{$filename}->{size} == $size and $cache->{$path}->{$filename}->{ino} == $ino and $cache->{$path}->{$filename}->{mtime} == $mtime ) { $cache_valid = 1; } else { print "File '$abs_file' modified, updating cache...\n"; } } # }}} my $md5; if ($cache_valid) { $md5 = $cache->{$path}->{$filename}->{md5}; } else { if (open(FILE, $file)) { binmode(FILE); $md5 = Digest::MD5->new->addfile(*FILE)->hexdigest; close(FILE); } else { print STDERR "Can't open file '$file': $!"; } } push @{$md5sums->{$dev}->{$md5}}, { # {{{ filename => $abs_file, #dir => $dir, dev => $dev, ino => $ino, #mode => $mode, #nlink => $nlink, #uid => $uid, #gid => $gid, rdev => $rdev, size => $size, atime => $atime, mtime => $mtime, ctime => $ctime, blksize => $blksize, blocks => $blocks, }; # }}} } } # }}} find({ wanted => \&find_directories, bydepth => 0, no_chdir => 1, }, @ARGV); # # Load cache # TODO: foreach my $xdir (@directories) { # {{{ my $file = "$xdir/$cache_file"; if (-f $file) { print "Trying to load cache file '$file' ... "; my $hardlink_cache = $dump->xml2pl($file); if (defined $hardlink_cache and ref($hardlink_cache) eq 'HASH') { $cache->{$xdir} = $hardlink_cache; print "OK\n" } } } # }}} find({ wanted => \&make_md5sum, bydepth => 0, no_chdir => 1, }, @ARGV); #print Dumper($md5sums); # # Regenerate cache # undef $cache; foreach my $dev (keys %$md5sums) { # {{{ my $md5list = $md5sums->{$dev}; while (my ($md5, $inf) = each %$md5list) { foreach my $hash (@$inf) { my ($filename, $path, $suffix) = fileparse($hash->{filename}, qr{\.[^.]+$}); $filename .= $suffix; $path =~ s#/$##g; $cache->{$path}->{$filename} = $hash; $cache->{$path}->{$filename}->{md5} = $md5; } } } # }}} #print "CACHE: " . Dumper($cache); # # Save cache # foreach my $xdir (@directories) { # {{{ my $file = "$xdir/$cache_file"; next unless (defined($cache->{$xdir})); print "Saving cache file '$file' ... "; if (open(FILE, ">$file")) { #print Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']); # OLD INSECURE FORMAT OF CACHE #print FILE Data::Dumper->Dump([$cache->{$xdir}], ['$hardlink_cache']); # more safer new format: print FILE $dump->pl2xml($cache->{$xdir}); close(FILE); print "OK\n"; } else { print "failed!\n"; print STDERR "Can't open cache file '$file': $!"; } } # }}} # # Statistics # {{{ my $hashes; foreach my $dev (keys %$md5sums) { foreach my $key (keys %{$md5sums->{$dev}}) { $hashes->{$key} = 1; } $stat->{$dev} = { count => 0, size => 0, blksize => 0, blocks => 0, }; } $stat_global->{unique_files} = scalar(keys %$hashes); # }}} # # Get mountpoints info # {{{ Quota::setmntent(); while (my ($mountdev, $mountpoint, $type, $opts) = Quota::getmntent()) { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = lstat($mountpoint); $mountpoints->{$dev} = { mountpoint => $mountpoint, mountdev => $mountdev, } if (defined $dev); } Quota::endmntent(); # }}} # # Do the real work: hardlink files and compute stats # foreach my $dev (keys %$md5sums) { foreach my $key (keys %{$md5sums->{$dev}}) { my @arr = @{$md5sums->{$dev}->{$key}}; my $count = scalar(@arr); if ($count > 1) { my $source = $arr[0]->{filename}; my $inum = $arr[0]->{ino}; # statistics report # {{{ $stat->{$dev}->{count} += ($count - 1); $stat->{$dev}->{size} += ($count - 1) * $arr[0]->{size}; $stat->{$dev}->{blocks} += ($count - 1) * $arr[0]->{blocks}; $stat->{$dev}->{blksize} = $arr[0]->{blksize}; # }}} print $source; for (my $i = 1; $i < $count; $i++) { # hardlink files {{{ print "\n\t<-- ", $arr[$i]->{filename}; if ($arr[$i]->{ino} == $inum) { print " [already linked]"; } else { unlink $arr[$i]->{filename} or warn "\n\nunlink error: $!\n"; link $source, $arr[$i]->{filename} or warn "\n\nlink error: $!\n"; } } # }}} print "\n\n"; } } } #print Dumper($stat, $mountpoints); #print Dumper($md5sums); # # Print statistics # {{{ foreach my $dev (keys %$md5sums) { my $info = $stat->{$dev}; print '-' x 60, "\n"; if (defined $mountpoints->{$dev}) { print "$mountpoints->{$dev}->{mountpoint} [$mountpoints->{$dev}->{mountdev}]\n"; } else { print "??? [$dev]\n"; } print " Unique files: ", scalar(keys %{$md5sums->{$dev}}), "\n"; print " Hardlinked files: $info->{count}\n"; print " Saved bytes: $info->{size}\n"; print " Saved blocks: $info->{blocks}\n"; print "Saved bytes in blocks: ", $info->{blksize} * $info->{blocks}, "\n"; } print '-' x 60, "\n"; print "TOTAL:\n"; print " Total unique files: $stat_global->{unique_files}\n"; print " Elapsed time: ", tv_interval($start, [gettimeofday]), " seconds\n"; # }}} # vim: ts=4 # vim600: fdm=marker fdl=0 fdc=3