#!/usr/bin/perl
#checkArchive
# is everything in the archive linked in the manifest that should be?
# are there missing files?
# read in the manifest, pull the links, compare
# expand later to check for database entries, and make sure xml file has header
# jody DeRidder
# 10/06/09
## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 7/30/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;

# ---------------------------------------------------------------------------
# checkArchive: cross-check the archive tree against its HTML manifests.
#
# The archive is walked at three levels:
#   top        $base/Manifest.html links the holder manifests
#   holder     $base/<holder>/Documentation/Manifest.html links the
#              collection manifests
#   collection <collection dir>/Documentation/Manifest.html links metadata
#              and content files
#
# Every inconsistency is written as an "ERROR: ..." line to $errors.
# ---------------------------------------------------------------------------

my $base   = "/srv/archive/";
my $errors = "./ArchiveERRORS";

open(my $out, '>', $errors) or die "can't write to $errors\n";

# --- 1. Top-level manifest -------------------------------------------------
# Collect every holder manifest linked from the top manifest; report links
# whose target file is missing.
# NOTE(review): link lines are presumably of the form
#   <a href="...lockss/<holder>/Documentation/Manifest.html">name</a>
# (inferred from the pattern below) -- confirm against a real manifest.
my %top_linked;    # path of each holder manifest that is linked AND exists
{
    my $top_manifest = $base . "Manifest.html";
    open(my $in, '<', $top_manifest) or die "can't read manifest\n";
    while (my $line = <$in>) {
        if ($line =~ /.*lockss\/(.*?\/Documentation\/Manifest.html)\">(.*?)<\/a>/) {
            my ($rel, $man_name) = ($1, $2);
            my $path = $base . $rel;
            if (!-e $path) {
                print $out "ERROR: $man_name is linked at top manifest but is not at $path\n";
            }
            else {
                $top_linked{$path} = 1;
            }
        }
    }
    close($in);
}

# --- 2. Holder directories versus the top manifest ---------------------------
my @dirs;          # holder-level directories, each with a trailing "/"
my @colldirs;      # collection directories; also the work list for step 4
my %is_colldir;    # membership test for @colldirs (also prevents duplicates)
my @collmans;      # collection manifests seen; gathered for later checks,
                   # not read again in this script

my $holders = visible_entries($base) or die "can't look through $base\n";
for my $file (@$holders) {
    my $path = $base . $file;
    next unless -d $path;
    push @dirs, $path . "/";

    my $manifest = $path . "/Documentation/Manifest.html";
    if (!-e $manifest) {
        print $out "ERROR: MISSING Manifest: $manifest\n";
        next;
    }
    if (!$top_linked{$manifest}) {
        print $out "ERROR: $manifest is not linked at the top level\n";
    }

    # Read the holder manifest: every collection it links is remembered, and
    # links to collection manifests that do not exist are reported.
    open(my $in, '<', $manifest) or die "can't read manifest\n";
    while (my $line = <$in>) {
        if ($line =~ /.*lockss\/(.*?)(\/Documentation\/Manifest.html)\">(.*?)<\/a>/) {
            my ($coll, $suffix, $man_name) = ($1, $2, $3);
            my $coll_manifest = $base . $coll . $suffix;
            my $colldir       = $base . $coll;
            push @colldirs, $colldir unless $is_colldir{$colldir}++;
            if (!-e $coll_manifest) {
                print $out "ERROR: $man_name is linked at holder manifest but is not at $coll_manifest\n";
            }
            else {
                push @collmans, $coll_manifest;
            }
        }
    }
    close($in);
}

# --- 3. Collections on disk that no holder manifest links --------------------
for my $dir (@dirs) {
    my $entries = visible_entries($dir) or die "can't look through $dir\n";
    for my $file (@$entries) {
        # The holder's own Documentation directory holds its manifest and is
        # not a collection. NOTE(review): Metadata is skipped as well, by
        # analogy with the rare-books handling in step 4 -- confirm.
        next if $file eq "Documentation" || $file eq "Metadata";

        # $dir already ends in "/", so this is the collection directory.
        my $path = $dir . $file;
        next unless -d $path;

        if (!$is_colldir{$path}) {
            print $out "ERROR: $path exists, not linked into holder manifest\n";
            push @colldirs, $path;
            $is_colldir{$path} = 1;
        }

        my $aman = $path . "/Documentation/Manifest.html";
        if (!-e $aman) {
            print $out "ERROR: $path has no manifest!!\n";
        }
        else {
            push @collmans, $aman;
        }
    }
}

# --- 4. Each collection: metadata, manifest links, unlinked files ------------
# Expected per collection (u0002_0000003, which links further manifests, and
# the u0015 collections are exempted from some checks below):
#   Documentation/<collnum>.v1.xml                 administrative metadata
#   Metadata/<collnum>.v1.txt or <collnum>.1.v1.txt, or an EAD at
#   Metadata/<collnum>.ead.v1.xml                  descriptive metadata
#   at least one linked .tif / .wav / .pdf         content
#
# @colldirs grows while it is processed (rare-books sub-collections are
# appended), so it is walked by index rather than with foreach.
for (my $i = 0; $i < @colldirs; $i++) {
    my $dir = $colldirs[$i];

    # Collection number = path below $base with "/" replaced by "_".
    (my $collnum = $dir) =~ s/^\Q$base\E//;
    $collnum =~ s,/,_,g;

    my $myman = $dir . "/Documentation/Manifest.html";
    my $link1 = $dir . "/Documentation/${collnum}.v1.xml";
    my $link3 = $dir . "/Metadata/${collnum}.v1.txt";
    my $link4 = $dir . "/Metadata/${collnum}.1.v1.txt";
    my $ead   = $dir . "/Metadata/" . $collnum . ".ead.v1.xml";
    my $has_ead = -e $ead;

    # Do the primary administrative and descriptive metadata files exist?
    if ((!-e $link1) && !$has_ead) {
        print $out "ERROR: $collnum: $link1 does not exist, and it has no ead ($ead)\n";
    }
    if (   (!-e $link3)
        && (!-e $link4)
        && !$has_ead
        && $collnum ne "u0002_0000003"
        && $collnum !~ /u0015/)
    {
        print $out "ERROR: $collnum: does not have collection-level metadata\n";
    }

    # Check every link in the collection manifest and note what it covers.
    my %linked;    # every existing path that the manifest links
    if (-e $myman) {
        open(my $man, '<', $myman) or die "can't read $myman\n";
        my ($have_meta, $have_admin, $have_content);
        while (my $line = <$man>) {
            next unless $line =~ /href=\".*?lockss\/(.*?)\">(.*?)<\/a/;
            my ($rel, $afile) = ($1, $2);
            my $apath = $base . $rel;
            $apath =~ s,//+,/,g;    # collapse repeated slashes from the link

            if (!-e $apath) {
                print $out "ERROR: BAD LINK in $myman: $afile\n";
                next;
            }

            if ($apath eq $link1) {
                $have_admin = 1;
            }
            elsif ($apath eq $link3 || $apath eq $link4 || $apath eq $ead) {
                $have_meta = 1;
            }
            elsif ($apath =~ /\.tif$/ || $apath =~ /\.wav/ || $apath =~ /\.pdf/) {
                $have_content = 1;
            }
            elsif ($apath =~ /Manifest\.html/) {
                push @collmans, $apath;    # this for u0002_0000003
            }
            $linked{$apath} = 1;
        }
        close($man);

        if (  !$have_meta
            && $collnum ne "u0002_0000003"
            && $collnum ne "u0015_0000001")
        {
            print $out "ERROR: $collnum does not link any collection level metadata\n";
        }
        if (!$have_admin && !$has_ead) {
            print $out "ERROR: $collnum does not link administrative metadata\n";
        }
        if (!$have_content && !$has_ead && $collnum ne "u0002_0000003") {
            print $out "ERROR: $collnum has no ead ($ead) and does not link any content!\n";
        }
    }

    # Walk the collection tree looking for files that should be linked in the
    # manifest but are not. @mydirs is a work list that grows as
    # subdirectories are discovered, hence the index loop.
    my @mydirs = ($dir);
    for (my $j = 0; $j < @mydirs; $j++) {
        my $adir    = $mydirs[$j];
        my $entries = visible_entries($adir);
        if (!$entries) {
            # A linked collection may be missing on disk; record that in the
            # report and keep checking instead of aborting the whole run.
            print $out "ERROR: can't open $adir\n";
            next;
        }

        for my $file (@$entries) {
            my $path = $adir . "/" . $file;

            if (-d $path) {
                if (   $collnum eq "u0002_0000003"
                    && $file ne "Documentation"
                    && $file ne "Metadata")
                {
                    # rare books have manifests next level down: queue the
                    # subdirectory as a collection of its own
                    push @colldirs, $path unless $is_colldir{$path}++;
                    next;
                }
                push @mydirs, $path;
                next;
            }

            # Only these file types are expected to be linked.
            # NOTE(review): the original also carried a disabled test for
            # /\.\d{1,2}\.txt$/; its exact extent was lost when the file
            # was flattened -- confirm.
            next
              unless $file =~ /\.tif$/
              || $file =~ /\.wav$/
              || $file =~ /\.v\d{1,2}\.txt$/
              || $file =~ /\.v\d{1,2}\.png$/
              || $file =~ /\.v\d{1,2}\.xml$/;

            # These derived or auxiliary files don't need to be linked.
            next
              if $file =~ /match/i
              || $file =~ /OAI/i
              || $file =~ /compound/i
              || $file =~ /ocr/
              || $file =~ /log/
              || $file =~ /mets/;

            $path =~ s,//+,/,g;    # same normalisation as the manifest links
            if (!$linked{$path}) {
                print $out "ERROR: $file exists in $adir exists, not linked into collection manifest\n";
            }
        }
    }
}

close($out) or die "can't finish writing $errors\n";
print "check $errors for errors! \n";

# Return a reference to the names in directory $dir that do not start with
# ".", in readdir order, or nothing (false) if the directory cannot be opened.
sub visible_entries {
    my ($dir) = @_;
    opendir(my $dh, $dir) or return;
    my @names = grep { !/^\./ } readdir($dh);
    closedir($dh);
    return \@names;
}