#!/usr/bin/perl
#####!C:\Perl64\bin\perl.exe
use strict;
use warnings;
use Time::Local;

# CabanissCheck
# goes through scans directories for EAD linking
# picks up box and folder numbers from directory names
# checks to see that all tiffs in those directories match those box and folder numbers
# checks for missing sequence numbers
# expected structure:
#   Collection_dir
#     Scans.*?_Box_\d{1,4}
#       Folder_\d{1,4}
#         u0003_0000252_BBFF\d{3}(.tif)?   or is subdir
#           u0003_0000252_BBFF\d{3}_\d{3}.tif
# where BB is box number, FF is folder number and tiffs are in another
# subdirectory only if there are pages
# check sequence on item and page level
#
# NOTE(review): the code below matches Box/Folder numbers of 1-2 digits and
# page suffixes of exactly 4 digits, which differs from the \d{1,4} / \d{3}
# shown in the structure sketch above -- confirm which one is intended.
#
##Copyright (c) 2009, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 11/12/09.
##All rights reserved.
##Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the
##   distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# jody DeRidder, 11/12/09

# Set by timestamp(); kept as a package variable because the sub assigns it.
our $timestamp;
timestamp();
# print $timestamp."\n";

my $output   = "S:\\Digital Projects\\Administrative\\scripts\\output\\CabanissTest_" . $timestamp . ".txt";
my $thiscoll = "u0003_0000252";

open(my $out, '>', $output) or die "can't open $output\n";

# ---------------------------------------------------------------------------
# Ask which tree to check.
# ---------------------------------------------------------------------------
print "Do you want to check Cabaniss content in \n1) the Complete folder or \n2) the In Progress folder?\n";
print "Please choose a number, then hit enter\n";
my $answ = <STDIN>;
$answ = '' unless defined $answ;    # EOF on STDIN falls through to the "Sorry" branch

my $base;
if ($answ =~ /1/) {
    $base = "S:\\Digital Projects\\Digital_Coll_Complete\\";
}
elsif ($answ =~ /2/) {
    $base = "S:\\Digital Projects\\Digital_Coll_in_progress\\";
}
else {
    print "Sorry, I didn't get that. Exiting now. Please start over. \n";
    sleep(2);
    exit;
}

# ---------------------------------------------------------------------------
# Locate the collection directory directly under $base.
# ---------------------------------------------------------------------------
my $target;
opendir(my $base_dh, $base) or die "can't look through $base\n";
while (defined(my $entry = readdir($base_dh))) {
    next if $entry =~ /^\./;    # skip dot files
    my $candidate = $base . $entry;
    if ($entry =~ /\Q$thiscoll\E/ && -d $candidate) {
        $target = $candidate;    # if several match, the last one read wins
    }
}
closedir($base_dh);

if (!$target) {
    print "Unable to locate Cabaniss folder in $base. \n Sorry!!\n\n";
    sleep(2);
    exit;
}

print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n";
print "Don't open it yet! :-) \n\n";
print ". . . working . . . (be patient) . . . \n\n";

# ---------------------------------------------------------------------------
# Findings, grouped by the report section they end up in.
# ---------------------------------------------------------------------------
my (@badform, @errors, @wrongdir, @dupes, @badcount, @foldercount, @pagecounts);
my %ItemSeq;         # "BB_FF"  => [ item sequence numbers seen in that box/folder ]
my %collnumItems;    # collnum  => { tif filename => [ directories it was found in ] }
my %pages;           # item id  => [ page numbers seen inside that item's directory ]
my $tifcount   = 0;
my $totalItems = 0;

# ---------------------------------------------------------------------------
# Walk the tree breadth-first.  An explicit queue is used instead of pushing
# onto the array a foreach is iterating over; the visiting order is the same.
# ---------------------------------------------------------------------------
my @queue = ($target . "\\");
while (defined(my $dir = shift @queue)) {
    opendir(my $dh, $dir) or die "can't read through $dir directory\n";

    # Directory path with everything before the collection number removed;
    # computed once per directory so every branch below sees the current value.
    (my $checkdir = $dir) =~ s,.*?\Q$thiscoll\E,$thiscoll,;

    while (defined(my $file = readdir($dh))) {
        next if $file =~ /^\./ || $file =~ /Admin/;    # skip dot files and Admin material
        my $path = $dir . $file;

        if (-d $path) {
            # An item-level directory counts as one item in its box/folder.
            if ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{7})$/) {
                my ($collnum, $thisItem) = ($1, $2);
                if ($thisItem =~ /^(\d{2})(\d{2})(\d{3})/) {
                    my $bf = $1 . "_" . $2;
                    push @{ $ItemSeq{$bf} }, $3 + 0;    # + 0 gets rid of padding zeros
                }
                if ($collnum ne $thiscoll) {
                    push @badform, "$file does not match collection number $thiscoll\n";
                }
            }
            push @queue, $path . "\\";    # every directory is descended into
        }
        elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{7})(\_(\d{4}))?\.tif$/) {
            my ($collnum, $thisItem, $pagestr) = ($1, $2, $4);
            my $itemID   = $collnum . "_" . $thisItem;
            my $has_page = $pagestr ? 1 : 0;
            my $thisPage = defined $pagestr ? $pagestr + 0 : 0;

            if ($collnum ne $thiscoll) {
                push @badform, "$file does not match collection number $thiscoll\n";
            }

            my ($boxnum, $foldernum, $item_sequence);
            if ($thisItem =~ /^(\d{2})(\d{2})(\d{3})/) {
                ($boxnum, $foldernum, $item_sequence) = ($1, $2, $3 + 0);
            }
            else {
                # Cannot happen while the filename pattern demands seven digits,
                # but never carry on with values left over from another file.
                push @badform, "Can't get boxnum, foldernum, item sequence from $thisItem";
                next;
            }

            # A tif without a page suffix is itself an item
            # (this catches tifs not in subdirectories).
            if (!$has_page) {
                push @{ $ItemSeq{ $boxnum . "_" . $foldernum } }, $item_sequence;
            }

            $tifcount++;
            push @{ $collnumItems{$collnum}{$file} }, $dir;    # keeps list of directories where items are found

            # What box and folder does the directory path claim?
            if ($checkdir =~ /Box\_(\d{1,2})(_[^\\]*)?\\Folder\_(\d{1,2})(_[^\\]*)?\\([^\\]+\\)?/i) {
                my $box     = sprintf("%02d", $1);    # left pad if needed
                my $folder  = sprintf("%02d", $3);
                my $itemdir = defined $5 ? $5 : '';

                if (($box ne $boxnum) || ($folder ne $foldernum)) {
                    push @badform, "$file name does not reflect Box $box Folder $folder at $path";
                }

                $itemdir =~ s,\\,,g;
                if ($itemdir ne '') {
                    if (!$thisPage) {
                        push @errors, "$file is in $itemdir directory, but has no page number\n";
                    }
                    elsif ($itemdir ne $itemID) {
                        push @wrongdir, "$file is in $itemdir; no match on item id\n";
                    }
                    else {
                        push @{ $pages{$itemdir} }, $thisPage;    # page number without the padding
                    }
                }
            }
            else {
                push @errors, "cannot parse out box and folder from $path\n";
            }
        }
        else {
            next if $file =~ /thumbs\.db/i || $path =~ /Admin/;
            push @badform, "$file in $checkdir";
        }
    }
    closedir($dh);
}

# ---------------------------------------------------------------------------
# Duplicates: the same tif name found in more than one directory.
# Keyed on the expected collection number rather than on whichever collection
# number happened to be matched last during the walk.
# ---------------------------------------------------------------------------
my @allfiles = sort keys %{ $collnumItems{$thiscoll} || {} };
foreach my $file (@allfiles) {
    my @found_in = @{ $collnumItems{$thiscoll}{$file} };
    if (@found_in > 1) {
        my $collect = join(" \n", @found_in);
        push @dupes, "$file exists in more than one place:\n $collect";
    }
}

# ---------------------------------------------------------------------------
# Item sequence within each box/folder must run 1..N with no gaps.
# ---------------------------------------------------------------------------
foreach my $bf (sort keys %ItemSeq) {
    my ($box, $folder) = split /_/, $bf;
    my @myItems  = sort by_number @{ $ItemSeq{$bf} };
    my $numItems = scalar @myItems;
    push @foldercount, "Box $box Folder $folder contains $numItems items";
    $totalItems += $numItems;
    for my $i (1 .. $numItems) {
        if ($myItems[ $i - 1 ] != $i) {
            push @badcount, "In Box $box Folder $folder: expected sequence $i instead of " . $myItems[ $i - 1 ];
        }
    }
}

# ---------------------------------------------------------------------------
# Page sequence within each paged item must run 1..N with no gaps.
# ---------------------------------------------------------------------------
foreach my $doc (sort keys %pages) {
    my @mypages  = sort by_number @{ $pages{$doc} };
    my $numPages = scalar @mypages;
    push @pagecounts, "$doc has $numPages pages";
    for my $i (1 .. $numPages) {
        if ($mypages[ $i - 1 ] != $i) {
            push @badcount, "$doc page problem: expected sequence $i instead of " . $mypages[ $i - 1 ];
        }
    }
}

# ---------------------------------------------------------------------------
# Report.
# ---------------------------------------------------------------------------
# @allfiles holds unique file names, so duplicates are already excluded from
# the count; nothing further is subtracted.
my $all = scalar @allfiles;
print {$out} "\nTotal count of valid content in this upload:\n $totalItems items and $all digital files\n";
#print "tiffcount $tifcount\n";

if (@badform || @badcount || @wrongdir || @errors || @dupes) {
    print {$out} "\nTROUBLE: \n";
    print {$out} "-------------------------------------------\n";
}
else {
    print {$out} "All is GREAT!! GOOD WORK!! :-) \n";
}

if (@errors) {
    print {$out} "\nThese general errors impeded this script, so everything may NOT have been checked:\n";
    print {$out} "Please repair and run this again.\n";
    print {$out} " $_\n" foreach @errors;
}
if (@wrongdir) {
    print {$out} "\nThe following files or directories do NOT reflect the name of their parent directory\n";
    print {$out} " $_\n" foreach @wrongdir;
}
if (@badform) {
    print {$out} "\nThe following filenames or directories are not in the correct format:\n";
    print {$out} " $_\n" foreach @badform;
}
if (@badcount) {
    print {$out} "\nThe following folders or items have sequence problems:\n";
    print {$out} " $_\n" foreach @badcount;
}
if (@dupes) {
    print {$out} "\nThe following files exist in more than one directory:\n";
    print {$out} " $_\n" foreach @dupes;
}

print {$out} "\n\nHere's some more info:\nFOLDER COUNTS:\n_________________\n";
print {$out} $_ . "\n" foreach @foldercount;
print {$out} "\n\nPAGE COUNTS:\n_______________________\n";
print {$out} $_ . "\n" foreach @pagecounts;

close($out) or die "can't finish writing $output\n";
print "Good bye!\n";
exit;

# Numeric ascending comparator for sort.
sub by_number { $a <=> $b; }

# Sets the package variable $timestamp to the current local time formatted as
# YYYYMMDDTHHMMSS (for example 20091112T142345) and also returns that string.
# The value comes from Perl's own clock, so no external command is run and no
# keyboard input is needed.
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime();
    $timestamp = sprintf(
        "%04d%02d%02dT%02d%02d%02d",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec
    );
    return $timestamp;
}