#!/usr/bin/perl # testNums # goes through scans directories for EAD linking # picks up box and folder numbers from directory names # checks to see that all tiffs in those directories match those box and folder numbers # checks for missing sequence numbers # expected structure: # Scans* # Box_\d{1,4} # Folder_\d{1,4} # u0003_0000252_BBFF\d{3}(.tif)? or is subdir # u0003_0000252_BBFF\d{3}_\d{3}.tif # where BB is box number, FF is folder number and tiffs are in another subdirectory only if there are pages # check sequence on item and page level # # jody DeRidder, 11/11/09 ## Copyright (c) 2010, The University of Alabama Libraries. ## Contributed by Jody DeRidder, 7/30/10. ## All rights reserved. ## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: ## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. ## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in ## the documentation and/or other materials provided with the distribution. ## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products ## derived from this software without specific prior written permission. ##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
## IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Walks every "Scan*" directory under $base, checks that each tiff name agrees
# with the Box_/Folder_ directories it sits in, checks item and page sequences
# for gaps, and writes a report to ./output/CabanissTest_<timestamp>.txt.
# A one-line summary of the totals is also printed to STDOUT.
# ---------------------------------------------------------------------------

my $timestamp = timestamp();
my $output    = "./output/CabanissTest_".$timestamp.".txt";
open(my $out, '>', $output) or die "can't open $output: $!\n";

my $base     = "/srv/deposits/Cabaniss/u0003_0000252_Cabaniss/";
my $thiscoll = "u0003_0000252";   # collection number every name must carry
my $tifcount = 0;                 # every tiff seen, duplicates included

my %ItemSeq;        # "BB_FF"  => [ item sequence numbers found in that box/folder ]
my %collnumItems;   # collnum  => { tiff filename => [ directories it was found in ] }
my %pages;          # item id  => [ page numbers found in that item directory ]
my (@badform, @errors, @wrongdir, @dupes, @badcount);
my (@foldercount, @pagecounts);

# Seed the work queue with the top-level scan directories.
my @mydirs;
opendir(my $content, $base) or die "can't look through $base: $!\n";
while (defined(my $file = readdir($content))) {
    next if $file =~ /^\./;            # skip dot files
    next unless $file =~ /Scan/i;
    my $path = $base.$file;
    push(@mydirs, $path."/") if -d $path;
}
closedir($content);

# Breadth-first walk.  Subdirectories are appended to the queue as they are
# found; a queue is used because growing an array while foreach iterates over
# it is not supported.
while (@mydirs) {
    my $dir = shift @mydirs;

    # Directory relative to the collection root.  Computed once per directory
    # so every branch below reports the directory actually being read.
    (my $checkdir = $dir) =~ s{.*?\Q$thiscoll\E}{$thiscoll};

    opendir(my $dh, $dir) or die "can't read through $dir directory: $!\n";
    while (defined(my $file = readdir($dh))) {
        next if $file =~ /^\./ || $file =~ /Admin/;   # skip dot files and Admin
        my $path = $dir.$file;

        if (-d $path) {
            # An item directory (<coll>_BBFFSSS) counts as one item in its box/folder.
            if ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{2})(\d{2})(\d{3})$/) {
                my ($collnum, $boxnum, $foldernum, $seq) = ($1, $2, $3, $4);
                push(@{$ItemSeq{$boxnum."_".$foldernum}}, $seq + 0);   # + 0 drops padding zeros
                if ($collnum ne $thiscoll) {
                    push(@badform, "$file does not match collection number $thiscoll\n");
                }
            }
            push(@mydirs, $path."/");   # every directory is descended into
        }
        elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{2})(\d{2})(\d{3})(?:\_(\d{4}))?\.tif$/) {
            my ($collnum, $boxnum, $foldernum, $seq, $pagetext) = ($1, $2, $3, $4, $5);
            my $itemID   = $collnum."_".$boxnum.$foldernum.$seq;
            my $is_page  = defined $pagetext;
            my $thisPage = $is_page ? $pagetext + 0 : 0;

            if ($collnum ne $thiscoll) {
                push(@badform, "$file does not match collection number $thiscoll\n");
            }

            # A tiff without a page suffix is an item in its own right
            # (this catches tifs not in subdirectories).
            push(@{$ItemSeq{$boxnum."_".$foldernum}}, $seq + 0) unless $is_page;

            $tifcount++;
            push(@{$collnumItems{$collnum}{$file}}, $dir);   # where this name was found

            # Which box and folder (and optional item directory) are we in?
            if ($checkdir =~ /Box\_(\d{1,2})(_[^\/]*)?\/Folder\_(\d{1,2})(_[^\/]*)?\/([^\/]+\/)?/i) {
                my $box     = sprintf("%02d", $1);   # left pad if needed
                my $folder  = sprintf("%02d", $3);
                my $itemdir = defined $5 ? $5 : '';
                if (($box ne $boxnum) || ($folder ne $foldernum)) {
                    push(@badform, "$file name does not reflect Box $box Folder $folder at $path");
                }
                $itemdir =~ s,\/,,g;
                if ($itemdir) {
                    if (!$thisPage) {
                        push(@errors, "$file is in $itemdir directory, but has no page number\n");
                    }
                    elsif ($itemdir ne $itemID) {
                        push(@wrongdir, "$file is in $itemdir; no match on item id\n");
                    }
                    else {
                        push(@{$pages{$itemdir}}, $thisPage);   # page sequence without the padding
                    }
                }
            }
            else {
                push(@errors, "cannot parse out box and folder from $path\n");
            }
        }
        else {
            next if $file =~ /thumbs\.db/i || $path =~ /Admin/;
            push(@badform, "$file in $checkdir");
        }
    }
    closedir($dh);
}

# Find tiff names that occur in more than one directory.  Every collection
# number that was seen is examined, not only the last one encountered.
my $subtract = 0;
foreach my $collnum (sort keys %collnumItems) {
    foreach my $file (sort keys %{$collnumItems{$collnum}}) {
        my @found_in = @{$collnumItems{$collnum}{$file}};
        my $numdirs  = scalar(@found_in);
        if ($numdirs > 1) {
            my $collect = join(" \n", @found_in);
            push(@dupes, "$file exists in more than one place:\n $collect");
            $subtract += $numdirs - 1;   # extra copies do not count as content
        }
    }
}

# Item sequence within each box/folder.
my $totalItems = 0;
foreach my $bf (sort keys(%ItemSeq)) {
    my ($box, $folder) = split("_", $bf);
    my @myItems  = sort by_number @{$ItemSeq{$bf}};
    my $numItems = scalar(@myItems);
    push(@foldercount, "Box $box Folder $folder contains $numItems items");
    $totalItems += $numItems;
    push(@badcount, sequence_problems("In Box $box Folder $folder", @myItems));
}

# Page sequence within each item directory.
foreach my $doc (sort keys(%pages)) {
    my @mypages  = sort by_number @{$pages{$doc}};
    my $numItems = scalar(@mypages);
    push(@pagecounts, "$doc has $numItems pages");
    push(@badcount, sequence_problems("$doc page problem", @mypages));
}

# Distinct tiffs = all tiffs seen minus the extra copies of duplicated names.
my $all = $tifcount - $subtract;
print "\nTotal count of valid content in this upload:\n $totalItems items and $all digital files\n";

if (@badform || @badcount || @wrongdir || @errors || @dupes) {
    print $out "\nTROUBLE: \n";
    print $out "-------------------------------------------\n";
}
else {
    print $out "All is GREAT!! GOOD WORK!! :-) \n";
}

print_section($out,
    "\nThese general errors impeded this script, so everything may NOT have been checked:\n"
    ."Please repair and run this again.\n",
    @errors);
print_section($out,
    "\nThe following files or directories do NOT reflect the name of their parent directory\n",
    @wrongdir);
print_section($out,
    "\nThe following filenames or directories are not in the correct format:\n",
    @badform);
print_section($out,
    "\nThe following files were found in more than one directory:\n",
    @dupes);
print_section($out,
    "\nThe following folders or items have sequence problems:\n",
    @badcount);

print $out "\n\nHere's some more info:\nFOLDER COUNTS:\n_________________\n";
print $out $_."\n" foreach @foldercount;
print $out "\n\nPAGE COUNTS:\n_______________________\n";
print $out $_."\n" foreach @pagecounts;
close($out) or die "can't close $output: $!\n";


# Returns the current UTC time as "YYYY-MM-DDTHH:MM:SSZ".
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    # gmtime months are 0-based and years are counted from 1900.
    return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# Numeric comparator for sort.
sub by_number { $a <=> $b; }

# Compares a numerically sorted list against the expected run 1..N.
#   $label  - text that prefixes each message
#   @sorted - sequence numbers, already sorted ascending
# Returns one message for every position whose value differs from its
# 1-based index (empty list when the sequence is complete).
sub sequence_problems {
    my ($label, @sorted) = @_;
    my @problems;
    for my $i (1 .. scalar(@sorted)) {
        my $found = $sorted[$i - 1];
        if ($found != $i) {
            push(@problems, $label.": expected sequence $i instead of ".$found);
        }
    }
    return @problems;
}

# Writes $heading followed by one indented line per entry to filehandle $fh.
# Writes nothing at all when there are no entries.
sub print_section {
    my ($fh, $heading, @entries) = @_;
    return unless @entries;
    print {$fh} $heading;
    print {$fh} " $_\n" foreach @entries;
    return;
}