#!/usr/bin/perl
# checkArchive
#
# Is everything in the archive linked in the manifest that should be?
# Are there missing files?
# Read in the manifests, pull the links, and compare them with what is on disk.
# Expand later to check for database entries, and make sure xml file has header.
#
# Every problem found is written as an "ERROR: ..." line to ./ArchiveERRORS.
#
# jody DeRidder
# 10/06/09

use strict;
use warnings;

my $base   = "/srv/archive/";
my $errors = "./ArchiveERRORS";

# Return the entries of directory $dir whose names do not start with a dot,
# in readdir order.  Dies with $failure when the directory cannot be opened.
sub visible_entries {
    my ($dir, $failure) = @_;
    opendir(my $dh, $dir) or die $failure;
    my @entries = grep { !/^\./ } readdir($dh);
    closedir($dh);
    return @entries;
}

open(my $out, '>', $errors) or die "can't write to $errors\n";

# ---------------------------------------------------------------------------
# Step 1: the top-level manifest.
# Collect every holder-level manifest it links; report links whose target
# file does not exist.
# ---------------------------------------------------------------------------
my %top_linked;    # existing holder manifests linked from the top manifest

my $top_manifest = $base."Manifest.html";
open(my $top_fh, '<', $top_manifest) or die "can't read manifest\n";
while (my $line = <$top_fh>) {
    if ($line =~ /.*lockss\/(.*?\/Documentation\/Manifest.html)\">(.*?)<\/a>/) {
        my ($rel, $manName) = ($1, $2);
        my $path = $base.$rel;
        if (! -e $path) {
            print {$out} "ERROR: $manName is linked at top manifest but is not at $path\n";
        }
        else {
            $top_linked{$path} = 1;
        }
    }
}
close($top_fh);
## so I now know if anything is linked at the top level that does not exist
## at the holder level

# ---------------------------------------------------------------------------
# Step 2: holder-level directories under $base.
# Each one should have Documentation/Manifest.html, that manifest should be
# linked from the top manifest, and every collection manifest it links
# should exist.
# ---------------------------------------------------------------------------
my @dirs;             # holder-level directories, each with a trailing slash
my @colldirs;         # collection-level directories, in discovery order
my %known_colldir;    # same set as @colldirs, for constant-time lookups
my @collmans;         # collection-level manifests found so far
                      # (collected only; nothing in this script reads it yet)

HOLDER:
for my $file (visible_entries($base, "can't look through $base\n")) {
    my $holder = $base.$file;
    next HOLDER unless -d $holder;

    push @dirs, $holder."/";
    my $manifest = $base.$file."/Documentation/Manifest.html";

    if (! -e $manifest) {
        ## --> a holder level that has no manifest
        print {$out} "ERROR: MISSING Manifest: $manifest\n";
        next HOLDER;
    }

    ## anything in the holder level that is not linked in the top level manifest
    if (! $top_linked{$manifest}) {
        print {$out} "ERROR: $manifest is not linked at the top level\n";
    }

    # now open it and look through it
    open(my $holder_fh, '<', $manifest) or die "can't read manifest\n";
    while (my $line = <$holder_fh>) {
        if ($line =~ /.*lockss\/(.*?)(\/Documentation\/Manifest.html)\">(.*?)<\/a>/) {
            my ($coll, $tail, $manName) = ($1, $2, $3);
            my $path    = $base.$coll.$tail;
            my $colldir = $base.$coll;

            # The collection directory is remembered even when its manifest
            # is missing, so that the later steps still examine it.
            if (! $known_colldir{$colldir}) {
                $known_colldir{$colldir} = 1;
                push @colldirs, $colldir;
            }

            if (! -e $path) {
                ## --> a collection level manifest is linked that does not exist
                print {$out} "ERROR: $manName is linked at holder manifest but is not at $path\n";
            }
            else {
                push @collmans, $path;
            }
        }
    }
    close($holder_fh);
    ## --> But I don't yet know if there are collections not linked,
    ##     or collections without Manifests
}

# ---------------------------------------------------------------------------
# Step 3: look inside every holder directory for collection directories
# that the holder manifest did not link, and for collections that have no
# manifest of their own.
# ---------------------------------------------------------------------------
for my $dir (@dirs) {
    COLLECTION:
    for my $file (visible_entries($dir, "can't look through $dir\n")) {
        # The holder's own manifest lives in <holder>/Documentation, so that
        # directory is not a collection.
        next COLLECTION if $file eq "Documentation";

        # $dir already ends in a slash.  The path must be built from the
        # holder directory (not from $base) to point at the collection.
        my $path = $dir.$file;
        next COLLECTION unless -d $path;

        if (! $known_colldir{$path}) {
            print {$out} "ERROR: $path exists, not linked into holder manifest\n";
            $known_colldir{$path} = 1;
            push @colldirs, $path;
        }

        my $aman = $path."/Documentation/Manifest.html";
        if (! -e $aman) {
            print {$out} "ERROR: $path has no manifest!!\n";
        }
        else {
            push @collmans, $aman;
        }
    }
}
# Now I know what collections have directories but aren't listed in holder
# manifests.  I also know which ones have no manifests.

# ---------------------------------------------------------------------------
# Step 4: go down into each collection.
#   - check to see what is not in the manifest that should be there
#   - check to see what does not exist in the directories, but should
#   - check to see what is in the manifest that does NOT exist
# Each collection-level manifest (with the exception of u0002_0000003,
# Hoole Rare Books, which links to more manifests)
#   - should have a collnum.xml and a collnum.v1.xml, linked into the
#     Manifest, located in Documentation
#   - should have a collnum.txt and a collnum.v1.txt, linked into the
#     Manifest, located in Metadata
#   - should have some tiffs or wave files -- check to see if links are good
# All directories are also searched for tiffs and wave files that are not
# linked.
#
# @colldirs grows while it is being processed (rare-book sub-collections are
# appended below), so it is walked by index rather than with foreach.
# ---------------------------------------------------------------------------
COLLDIR:
for (my $i = 0; $i < @colldirs; $i++) {
    my $dir = $colldirs[$i];

    # Collection number: the path relative to $base with slashes turned into
    # underscores, e.g. <base>u0002/0000003 -> u0002_0000003.
    (my $collnum = $dir) =~ s,\Q$base\E,,;
    $collnum =~ s,\/,\_,g;
    # print "Collnum is $collnum for $dir\n";

    my $myman = $dir."/Documentation/Manifest.html";
    my $link1 = $dir."/Documentation/$collnum.v1.xml";
    my $link2 = $dir."/Metadata/$collnum.v1.txt";
    my $link3 = $dir."/Metadata/$collnum.1.txt";

    # Administrative files that must exist on disk.
    for my $file ($dir."/Documentation/$collnum.xml", $link1) {
        if (! -e $file) {
            print {$out} "ERROR: $collnum: $file does not exist\n";
        }
    }

    # Any one of these counts as collection-level metadata being present.
    my $hasMeta = grep { -e $_ } ($dir."/Metadata/$collnum.txt", $link2, $link3);
    if ((! $hasMeta) && ($collnum ne "u0002_0000003")) {
        print {$out} "ERROR: $collnum: does not have collection-level metadata\n";
    }
    # now we know if primary administrative and descriptive metadata exist.
    # We don't yet know if they're linked, though.

    # If there's a manifest, check every link in it, and verify that one is
    # Metadata/collnum.v1.txt (or collnum.1.txt), one is
    # Documentation/collnum.v1.xml, and at least one is a content file.
    my %linked;    # every existing path linked from this collection's manifest
    if (-e $myman) {
        open(my $man_fh, '<', $myman) or die "can't read $myman\n";
        my ($haveMeta, $haveAdmin, $haveContent);

        LINK:
        while (my $line = <$man_fh>) {
            next LINK unless $line =~ /href=\".*?lockss\/(.*?)\">(.*?)<\/a/;
            my ($cleanme, $afile) = ($1, $2);

            $cleanme =~ s,\/\/+,\/,g;    # remove multiple slashes from link
            my $apath = $base.$cleanme;
            $apath =~ s,\/\/+,\/,g;      # remove multiple slashes from link

            if (! -e $apath) {
                print {$out} "ERROR: BAD LINK in $myman: $afile\n";
                next LINK;
            }

            if    ($apath eq $link1) { $haveAdmin = 1; }
            elsif ($apath eq $link2) { $haveMeta  = 1; }
            elsif ($apath eq $link3) { $haveMeta  = 1; }
            elsif ($apath =~ /\.tif$/ || $apath =~ /\.wav/ || $apath =~ /\.pdf/) {
                $haveContent = 1;
            }
            elsif ($apath =~ /Manifest\.html/) {
                push @collmans, $apath;    # this for u0002_0000003
            }
            $linked{$apath} = 1;
        }
        close($man_fh);

        if (! $haveMeta && $collnum ne "u0002_0000003") {
            print {$out} "ERROR: $collnum does not link any metadata\n";
        }
        if (! $haveAdmin) {
            print {$out} "ERROR: $collnum does not link administrative metadata\n";
        }
        if (! $haveContent && $collnum ne "u0002_0000003") {
            print {$out} "ERROR: $collnum does not link any content!\n";
        }
    }
    # all the links found in the manifest are now in %linked
    # I now know if the collection has linked metadata, admin info, and content

    # A collection can be linked from a holder manifest without existing on
    # disk; report that and move on instead of dying in opendir below.
    if (! -d $dir) {
        print {$out} "ERROR: $collnum: $dir does not exist\n";
        next COLLDIR;
    }

    # Now go down into the directories and find all the tiffs/wave files
    # (and versioned txt/png/xml files), and see if there are any not linked
    # in the manifest.  @mydirs is a work queue: sub-directories are appended
    # as they are found.
    my @mydirs = ($dir);
    while (@mydirs) {
        my $adir = shift @mydirs;

        ENTRY:
        for my $file (visible_entries($adir, "can't open $adir")) {
            my $path = $adir."/".$file;

            if (-d $path) {
                if ($collnum eq "u0002_0000003" && $file ne "Documentation" && $file ne "Metadata") {
                    # rare books have manifests next level down: treat each
                    # sub-directory as a collection of its own
                    if (! $known_colldir{$path}) {
                        $known_colldir{$path} = 1;
                        push @colldirs, $path;
                    }
                    next ENTRY;
                }
                push @mydirs, $path;
            }
            elsif ($file =~ /\.tif$/ ||
                   $file =~ /\.wav$/ ||
                   $file =~ /\.v\d{1,2}\.txt$/ ||
                   # $file =~ /\.\d{1,2}\.txt$/ ||
                   $file =~ /\.v\d{1,2}\.png$/ ||
                   $file =~ /\.v\d{1,2}\.xml$/) {
                # these don't need to be linked
                next ENTRY if $file =~ /match/i || $file =~ /OAI/i || $file =~ /compound/i;

                $path =~ s,\/\/+,\/,g;    # remove multiple slashes from link
                if (! $linked{$path}) {
                    print {$out} "ERROR: $file exists in $adir exists, not linked into collection manifest\n";
                }
            }
        }
    }
}

# Buffered write errors only surface at close, so check it.
close($out) or die "can't write to $errors\n";