#!/usr/bin/perl
#####!C:\Perl64\bin\perl.exe
use Time::Local;

# BoxFolderCheck
# asks for collection number
# goes through scans directories for EAD linking
# picks up box and folder numbers from directory names
# checks to see that all tiffs in those directories match those box and folder numbers
# checks for missing sequence numbers
# expected structure:
# Collection_dir
#   Scans.*?_Box_\d{1,4}
#     Folder_\d{1,4}
#       collnum_BBFF\d{3}(.tif)? or is subdir
#         collnum_BBFF\d{3}_\d{3}.tif
# (collnum is letter, 4 digits, underscore,7 digits)
# where BB is box number, FF is folder number and tiffs are in another subdirectory only if there are pages
# check sequence on item and page level

#This script is PC/MAC compatible.
#For MAC compatibility:
#any backwards path slashes (i.e. "\\") are replaced with "\/".
#whitespaces and colons are stripped from the $timestamp variable; otherwise the timestamped report file can't be written as it would contain whitespaces/colons.
#for the Mac OS to be able to read the shebang line this file must be saved with UNIX style line endings.
#
# jody DeRidder, 11/12/09
# adapted from CabanissCheck on 10/13/10
# adapted to Mac and improved by Nitin Arora, 10/14/10

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder and Nitin Arora, 10/18/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;
use POSIX qw(strftime);

# Report timestamp (YYYYMMDDTHHMMSS).  timestamp() publishes its result here,
# as it always has, and additionally returns it.
our $timestamp;

# Findings, grouped the way the report prints them.
my @errors;         # problems that stopped a file from being fully checked
my @wrongdir;       # page tiffs sitting in an item directory with another id
my @badform;        # names that do not follow the naming convention
my @badcount;       # gaps / repeats in item or page sequences
my @dupes;          # file names found in more than one directory
my @foldercount;    # informational: items per box/folder
my @pagecounts;     # informational: pages per item

my %ItemSeq;        # "BB_FF"    => [ item sequence numbers seen ]
my %pages;          # item id    => [ page numbers seen ]
my %collnumItems;   # collnum    => { file name => [ directories it was found in ] }
my $totalItems = 0;

# ---------------------------------------------------------------------------
# 1. Ask the operator what to check
# ---------------------------------------------------------------------------
print "\nPlease enter the collection number:\n\n";
my $answ = read_answer();
if ($answ !~ /^[a-z]{1}\d{4}\_\d{7}$/) {
    print "$answ is not a valid collection number. Please start over.\n";
    exit;
}
my $thiscoll = $answ;    # letter, 4 digits, underscore, 7 digits

print "\nAre you working on a Mac (enter \"m\") or a PC (enter \"p\")?\n\n";
$answ = read_answer();
my $type;
if ($answ =~ /m/i) {
    $type = "mac";
}
elsif ($answ =~ /p/i) {
    $type = "pc";
}
else {
    print "$answ is not p or m. Please start over.\n";
    exit;
}

# Everything platform-specific boils down to the separator and the share root.
my $sep      = $type eq "pc" ? "\\" : "/";
my $projects = $type eq "pc"
    ? "S:\\Digital Projects\\"
    : "/Volumes/SHARE/Digital Projects/";

timestamp();

# The report is named <collnum>.<timestamp>.txt (the old code interpolated an
# undefined "$collnum_" in front of it, which expanded to nothing).
my $output = $projects
    . join($sep, "Administrative", "scripts", "output", "")
    . $thiscoll . "." . $timestamp . ".txt";

open(my $out, '>', $output) or die "can't open $output\n";

print "Do you want to check content in \n1) the Complete folder or \n2) the In Progress folder?\n";
print "Please choose a number, then hit enter\n";
$answ = read_answer();
my $base;
if ($answ =~ /1/) {
    $base = $projects . "Digital_Coll_Complete" . $sep;
}
elsif ($answ =~ /2/) {
    $base = $projects . "Digital_Coll_in_progress" . $sep;
}
else {
    print "Sorry, I didn't get that. Exiting now. Please start over. \n";
    sleep(2);
    exit;
}

# ---------------------------------------------------------------------------
# 2. Locate the collection directory (last matching directory wins)
# ---------------------------------------------------------------------------
opendir(my $base_dh, $base) or die "can't look through $base\n";
my $target;
while (defined(my $entry = readdir $base_dh)) {
    next if $entry =~ /^\./;    # skip dot files
    my $candidate = $base . $entry;
    $target = $candidate if $entry =~ /\Q$thiscoll\E/ && -d $candidate;
}
closedir $base_dh;

if (!$target) {
    print "Unable to locate $thiscoll folder in $base. \n Sorry!!\n\n";
    sleep(2);
    exit;
}

print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n";
print "Don't open it yet! :-) \n\n";
print ". . . working . . . (be patient) . . . \n\n";

# ---------------------------------------------------------------------------
# 3. Walk the collection tree
# ---------------------------------------------------------------------------
# Matches ...Box_<n>[_label]/Folder_<n>[_label]/[itemdir/] using whichever
# separator this platform uses; captures box, folder and the optional item
# directory (without its trailing separator).
my $q = quotemeta $sep;
my $box_folder_re =
    qr/Box_(\d{1,2})(?:_[^$q]*)?${q}Folder_(\d{1,2})(?:_[^$q]*)?$q(?:([^$q]+)$q)?/i;

# Breadth-first work queue.  (The old code appended to the very array it was
# iterating with foreach, which perlsyn documents as unsafe.)
my @queue = ($target . $sep);
while (@queue) {
    my $dir = shift @queue;
    opendir(my $dh, $dir) or die "can't read through $dir directory\n";

    # Directory name relative to the collection folder, for messages/parsing.
    (my $checkdir = $dir) =~ s,.*?(\Q$thiscoll\E),$thiscoll,;

    # Box / folder / item directory depend only on the directory, so parse
    # them once per directory instead of once per file.
    my ($dir_box, $dir_folder, $itemdir);
    my $dir_parsed = 0;
    if ($checkdir =~ $box_folder_re) {
        $dir_parsed = 1;
        $dir_box    = sprintf("%02d", $1);    # left pad if needed
        $dir_folder = sprintf("%02d", $2);
        $itemdir    = $3;
    }

    ENTRY:
    while (defined(my $file = readdir $dh)) {
        next ENTRY
            if $file =~ /^\./
            || $file =~ /Admin/
            || $file =~ /Transcripts/
            || $file =~ /upload/i;

        my $path = $dir . $file;

        if (-d $path) {
            # An item directory (collnum_BBFFnnn) counts as one item.
            if ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{2})(\d{2})(\d{3})$/) {
                my ($coll, $boxnum, $foldernum, $seq) = ($1, $2, $3, $4 + 0);
                push @{ $ItemSeq{"${boxnum}_$foldernum"} }, $seq;
                push @badform, "$file does not match collection number $thiscoll\n"
                    if $coll ne $thiscoll;
            }
            push @queue, $path . $sep;    # descend into every directory
        }
        elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{2})(\d{2})(\d{3})(?:\_(\d{4}))?\.tif$/) {
            my ($coll, $boxnum, $foldernum, $seq_str, $page) = ($1, $2, $3, $4, $5);
            my $item_id  = $coll . "_" . $boxnum . $foldernum . $seq_str;
            my $is_page  = defined $page;
            my $page_num = $is_page ? $page + 0 : 0;    # drop zero padding

            push @badform, "$file does not match collection number $thiscoll\n"
                if $coll ne $thiscoll;

            # A tiff without a page suffix is itself an item.
            push @{ $ItemSeq{"${boxnum}_$foldernum"} }, $seq_str + 0
                unless $is_page;

            # Remember every directory this file name shows up in.
            push @{ $collnumItems{$coll}{$file} }, $dir;

            if (!$dir_parsed) {
                push @errors, "cannot parse out box and folder from $path\n";
                next ENTRY;
            }

            if ($dir_box ne $boxnum || $dir_folder ne $foldernum) {
                push @badform,
                    "$file name does not reflect Box $dir_box Folder $dir_folder at $path";
            }

            if (defined $itemdir) {
                if (!$page_num) {
                    push @errors, "$file is in $itemdir directory, but has no page number\n";
                }
                elsif ($itemdir ne $item_id) {
                    push @wrongdir, "$file is in $itemdir; no match on item id\n";
                }
                else {
                    push @{ $pages{$itemdir} }, $page_num;
                }
            }
        }
        else {
            next ENTRY if $file =~ /thumbs\.db/i || $path =~ /Admin/;
            push @badform, "$file $checkdir";
        }
    }
    closedir $dh;
}

# ---------------------------------------------------------------------------
# 4. Tally duplicates and sequence gaps
# ---------------------------------------------------------------------------
# File names are strings, so sort them as strings; key on the collection the
# operator asked for rather than whichever file name was parsed last.
my %found_in = %{ $collnumItems{$thiscoll} || {} };
my @allfiles = sort keys %found_in;
for my $file (@allfiles) {
    my @places = @{ $found_in{$file} };
    next if @places <= 1;
    my $collect = join(" \n", @places);
    push @dupes, "$file exists in more than one place:\n $collect";
}

for my $bf (sort keys %ItemSeq) {
    my ($box, $folder) = split /_/, $bf;
    my @myItems  = sort by_number @{ $ItemSeq{$bf} };
    my $numItems = scalar @myItems;
    push @foldercount, "Box $box Folder $folder contains $numItems items";
    $totalItems += $numItems;
    for my $i (1 .. $numItems) {
        next if $myItems[$i - 1] == $i;
        push @badcount,
            "In Box $box Folder $folder: expected sequence $i instead of " . $myItems[$i - 1];
    }
}

for my $doc (sort keys %pages) {
    my @mypages  = sort by_number @{ $pages{$doc} };
    my $numPages = scalar @mypages;
    push @pagecounts, "$doc has $numPages pages";
    for my $i (1 .. $numPages) {
        next if $mypages[$i - 1] == $i;
        push @badcount,
            "$doc page problem: expected sequence $i instead of " . $mypages[$i - 1];
    }
}

# @allfiles is already one entry per distinct file name, so no duplicate
# correction is needed (the old code subtracted duplicates a second time).
my $all = scalar @allfiles;

# ---------------------------------------------------------------------------
# 5. Write the report
# ---------------------------------------------------------------------------
print {$out} "\nTotal count of valid content in this upload:\n $totalItems items and $all digital files\n";

if (@badform || @badcount || @wrongdir || @errors || @dupes) {
    print {$out} "\nTROUBLE: \n";
    print {$out} "-------------------------------------------\n";
}
else {
    print {$out} "All is GREAT!! GOOD WORK!! :-) \n";
}

print_section($out,
    "\nThese general errors impeded this script, so everything may NOT have been checked:\n"
        . "Please repair and run this again.\n",
    @errors);
print_section($out,
    "\nThe following files or directories do NOT reflect the name of their parent directory\n",
    @wrongdir);
print_section($out,
    "\nThe following filenames or directories are not in the correct format:\n",
    @badform);
print_section($out,
    "\nThe following folders or items have sequence problems:\n",
    @badcount);
print_section($out,
    "\nThe following files exist in more than one place:\n",
    @dupes);

print {$out} "\n\nHere's some more info:\nFOLDER COUNTS:\n_________________\n";
print {$out} $_ . "\n" for @foldercount;
print {$out} "\n\nPAGE COUNTS:\n_______________________\n";
print {$out} $_ . "\n" for @pagecounts;

close $out or die "can't finish writing $output: $!\n";
print "Good bye!\n";
exit;

# Numeric comparator for sort().
sub by_number { $a <=> $b; }

# Sets (and returns) $timestamp as local time in the form YYYYMMDDTHHMMSS.
# The value contains no whitespace or colons, so it is safe in file names on
# both Windows and Mac.  Uses strftime instead of shelling out to the
# interactive DOS date/time commands, so no key presses are needed.
sub timestamp {
    $timestamp = strftime("%Y%m%dT%H%M%S", localtime);
    return $timestamp;
}

# Reads one line from STDIN and returns it without its newline; returns the
# empty string when STDIN is at end-of-file.
sub read_answer {
    my $line = <STDIN>;
    return '' unless defined $line;
    chomp $line;
    return $line;
}

# Prints $heading followed by one indented line per entry to filehandle $fh.
# Prints nothing at all when there are no entries.
sub print_section {
    my ($fh, $heading, @lines) = @_;
    return unless @lines;
    print {$fh} $heading;
    print {$fh} " $_\n" for @lines;
    return;
}