#!/usr/bin/perl
#checkArchive
# is everything in the archive linked in the manifest that should be?
# are there missing files?
# read in the manifest, pull the links, compare
# expand later to check for database entries, and make sure xml file has header

##Copyright (c) 2009, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 10/06/09.
##All rights reserved.
##Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the
## distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Jody DeRidder
# 10/06/09
#
# Cross-checks the directory tree under $base against the Manifest.html pages
# found at three levels (top, holder, collection) and writes one "ERROR: ..."
# line per discrepancy to the file named by $errors.
#
# The script takes no arguments.  It dies if the error file cannot be
# written, if the top-level manifest cannot be read, or if a directory or
# manifest that exists cannot be opened.

use strict;
use warnings;

my $base   = "/srv/archive/";
my $errors = "./ArchiveERRORS";

# Collection whose manifest links to further manifests rather than to metadata
# and content (Hoole Rare Books); several checks below are waived for it.
my $RARE_BOOKS = "u0002_0000003";

# Returns a copy of the given path with every run of slashes reduced to one,
# so that paths built from manifest links and paths built from directory
# listings compare equal.
sub collapse_slashes {
    my ($path) = @_;
    $path =~ s{//+}{/}g;
    return $path;
}

open(my $out, '>', $errors) or die "can't write to $errors: $!\n";

# ---------------------------------------------------------------------------
# 1. Top-level manifest: every holder manifest it links must exist.
# ---------------------------------------------------------------------------
my $top_manifest = $base."Manifest.html";
my %top_linked;    # existing holder manifests that the top manifest links

open(my $top_fh, '<', $top_manifest)
    or die "can't read manifest $top_manifest: $!\n";
while (my $line = <$top_fh>) {
    next unless $line =~ m{.*lockss/(.*?/Documentation/Manifest\.html)">(.*?)</a>};
    my ($rel_path, $man_name) = ($1, $2);
    my $path = $base.$rel_path;
    if (! -e $path) {
        print {$out} "ERROR: $man_name is linked at top manifest but is not at $path\n";
    }
    else {
        $top_linked{$path} = 1;
    }
}
close($top_fh);

# ---------------------------------------------------------------------------
# 2. Holder level: every directory under $base needs a manifest that is linked
#    from the top manifest; collect the collections each holder manifest links.
# ---------------------------------------------------------------------------
my @dirs;          # holder-level directories, each with a trailing slash
my @colldirs;      # collection-level directories, in discovery order
my %is_colldir;    # same set as @colldirs, for constant-time lookup
my @collmans;      # collection-level manifests found (not read again in this script)

opendir(my $base_dh, $base) or die "can't look through $base: $!\n";
while (defined(my $file = readdir($base_dh))) {
    next if $file =~ /^\./;
    my $holder = $base.$file;
    next unless -d $holder;

    push @dirs, $holder."/";
    my $holder_manifest = $base.$file."/Documentation/Manifest.html";
    if (! -e $holder_manifest) {
        print {$out} "ERROR: MISSING Manifest: $holder_manifest\n";
        next;
    }
    if (! $top_linked{$holder_manifest}) {
        print {$out} "ERROR: $holder_manifest is not linked at the top level\n";
    }

    open(my $holder_fh, '<', $holder_manifest)
        or die "can't read manifest $holder_manifest: $!\n";
    while (my $line = <$holder_fh>) {
        next unless $line =~ m{.*lockss/(.*?)(/Documentation/Manifest\.html)">(.*?)</a>};
        my ($coll_rel, $man_tail, $man_name) = ($1, $2, $3);
        my $path    = $base.$coll_rel.$man_tail;
        my $colldir = $base.$coll_rel;

        # A collection linked more than once is still checked only once.
        push @colldirs, $colldir unless $is_colldir{$colldir}++;

        if (! -e $path) {
            print {$out} "ERROR: $man_name is linked at holder manifest but is not at $path\n";
        }
        else {
            push @collmans, $path;
        }
    }
    close($holder_fh);
}
closedir($base_dh);

# ---------------------------------------------------------------------------
# 3. Collections on disk that no holder manifest links, or that lack a
#    manifest of their own.
# ---------------------------------------------------------------------------
foreach my $dir (@dirs) {
    opendir(my $holder_dh, $dir) or die "can't look through $dir: $!\n";
    while (defined(my $file = readdir($holder_dh))) {
        next if $file =~ /^\./;
        # The holder's own Documentation directory holds the holder manifest
        # checked above; it is not a collection.
        next if $file eq "Documentation";

        # Entries live under the holder directory ($dir already ends in "/"),
        # not directly under $base.
        my $path = $dir.$file;
        next unless -d $path;

        if (! $is_colldir{$path}) {
            print {$out} "ERROR: $path exists, not linked into holder manifest\n";
            push @colldirs, $path;
            $is_colldir{$path} = 1;
        }

        my $coll_manifest = $path."/Documentation/Manifest.html";
        if (! -e $coll_manifest) {
            print {$out} "ERROR: $path has no manifest!!\n";
        }
        else {
            push @collmans, $coll_manifest;
        }
    }
    closedir($holder_dh);
}

# ---------------------------------------------------------------------------
# 4. Per collection: required files exist, the manifest links metadata, admin
#    info and content, every link resolves, and every archival file on disk
#    is linked from the manifest.
#
# Each collection (except $RARE_BOOKS, which links to more manifests)
#   should have a collnum.xml and collnum.v1.xml in Documentation,
#   should have a collnum.txt, collnum.v1.txt or collnum.1.txt in Metadata,
#   should link some tiff, wave or pdf content.
# ---------------------------------------------------------------------------

# Work queue rather than foreach over @colldirs: sub-collections discovered
# under $RARE_BOOKS are appended while the queue is being drained.
my @queue = @colldirs;
while (defined(my $dir = shift @queue)) {
    $dir = collapse_slashes($dir);

    # Collection number: path relative to $base with "/" turned into "_".
    (my $collnum = $dir) =~ s/\Q$base\E//;
    $collnum =~ s{/}{_}g;

    my $myman = $dir."/Documentation/Manifest.html";
    my $link1 = $dir."/Documentation/$collnum.v1.xml";    # administrative metadata
    my $link2 = $dir."/Metadata/$collnum.v1.txt";         # descriptive metadata
    my $link3 = $dir."/Metadata/$collnum.1.txt";          # descriptive metadata, alternate name

    foreach my $required ($dir."/Documentation/$collnum.xml", $link1) {
        if (! -e $required) {
            print {$out} "ERROR: $collnum: $required does not exist\n";
        }
    }

    my $hasMeta = grep { -e $_ } ($dir."/Metadata/$collnum.txt", $link2, $link3);
    if (! $hasMeta && $collnum ne $RARE_BOOKS) {
        print {$out} "ERROR: $collnum: does not have collection-level metadata\n";
    }

    # The files exist (or not); now see whether the manifest links them.
    my %linked;    # every path the manifest links that exists on disk
    if (-e $myman) {
        my ($haveMeta, $haveAdmin, $haveContent);

        open(my $man_fh, '<', $myman) or die "can't read $myman: $!\n";
        while (my $line = <$man_fh>) {
            next unless $line =~ m{href=".*?lockss/(.*?)">(.*?)</a};
            my ($link_rel, $link_text) = ($1, $2);
            my $apath = collapse_slashes($base.$link_rel);

            if (! -e $apath) {
                print {$out} "ERROR: BAD LINK in $myman: $link_text\n";
                next;
            }

            if ($apath eq $link1) {
                $haveAdmin = 1;
            }
            elsif ($apath eq $link2 || $apath eq $link3) {
                $haveMeta = 1;
            }
            elsif ($apath =~ /\.tif$/ || $apath =~ /\.wav/ || $apath =~ /\.pdf/) {
                $haveContent = 1;
            }
            elsif ($apath =~ /Manifest\.html/) {
                push @collmans, $apath;    # this for $RARE_BOOKS
            }
            $linked{$apath} = 1;
        }
        close($man_fh);

        if (! $haveMeta && $collnum ne $RARE_BOOKS) {
            print {$out} "ERROR: $collnum does not link any metadata\n";
        }
        if (! $haveAdmin) {
            print {$out} "ERROR: $collnum does not link administrative metadata\n";
        }
        if (! $haveContent && $collnum ne $RARE_BOOKS) {
            print {$out} "ERROR: $collnum does not link any content!\n";
        }
    }

    # Walk the collection tree breadth-first looking for files that should be
    # linked from the manifest but are not.
    my @mydirs = ($dir);
    while (defined(my $adir = shift @mydirs)) {
        opendir(my $coll_dh, $adir) or die "can't open $adir: $!";
        while (defined(my $file = readdir($coll_dh))) {
            next if $file =~ /^\./;
            my $path = $adir."/".$file;

            if (-d $path) {
                if ($collnum eq $RARE_BOOKS
                    && $file ne "Documentation" && $file ne "Metadata") {
                    # rare books have manifests next level down: check the
                    # subdirectory as a collection of its own
                    push @queue, $path;
                    next;
                }
                push @mydirs, $path;
                next;
            }

            # Only these file types are expected to be linked.  Plain ".N.txt"
            # files are deliberately not checked.
            # NOTE(review): the original layout was flattened; confirm that
            # .vN.png belongs in this list.
            next unless $file =~ /\.(?:tif|wav)$/
                     || $file =~ /\.v\d{1,2}\.(?:txt|png|xml)$/;

            # match / OAI / compound files don't need to be linked
            next if $file =~ /match/i || $file =~ /OAI/i || $file =~ /compound/i;

            if (! $linked{ collapse_slashes($path) }) {
                print {$out} "ERROR: $file exists in $adir exists, not linked into collection manifest\n";
            }
        }
        closedir($coll_dh);
    }
}

# Buffered write errors only surface at close.
close($out) or die "can't finish writing $errors: $!\n";