#!/usr/bin/perl
#####!C:\Perl64\bin\perl.exe
use strict;
use warnings;
use Time::Local;

# CabanissCheck
#
# Walks the scan directories of the Cabaniss collection (for EAD linking)
# and checks that:
#   * every tiff's file name carries the box and folder numbers of the
#     Box_/Folder_ directories it lives in,
#   * tiffs inside an item subdirectory are named after that subdirectory
#     and carry a page number,
#   * item sequence numbers within each box/folder, and page numbers within
#     each item, run 1..N,
#   * no tiff file name occurs in more than one directory.
#
# Expected structure:
#   Collection_dir
#      Scans.*?_Box_\d{1,4}
#          Folder_\d{1,4}
#              u0003_0000252_BBFF\d{3}(.tif)?   or is subdir
#                   u0003_0000252_BBFF\d{3}_\d{3}.tif
# where BB is the box number, FF is the folder number, and tiffs sit in a
# further subdirectory only when the item has pages.
#
# NOTE: the patterns in the code are narrower than the sketch above: they
# accept 1-2 digit box and folder numbers and a 4-digit page suffix.
#
# jody DeRidder, 11/12/09

my $thiscoll  = "u0003_0000252";
my $timestamp = timestamp();
my $output    = "S:\\Digital Projects\\Administrative\\scripts\\output\\CabanissTest_" . $timestamp . ".txt";

# Open the report first so an unwritable output location fails fast,
# before the (slow) directory walk.
open(my $out, '>', $output) or die "can't open $output: $!\n";

print "Do you want to check Cabaniss content in \n1) the Complete folder or \n2) the In Progress folder?\n";
print "Please choose a number, then hit enter\n";
my $answ = <STDIN>;
$answ = '' unless defined $answ;    # EOF on STDIN: fall through to "Sorry"

my $base;
if ($answ =~ /1/) {
    $base = "S:\\Digital Projects\\Digital_Coll_Complete\\";
}
elsif ($answ =~ /2/) {
    $base = "S:\\Digital Projects\\Digital_Coll_in_progress\\";
}
else {
    print "Sorry, I didn't get that. Exiting now. Please start over. \n";
    sleep(2);
    exit;
}

# Locate the collection directory under the chosen base.  As in the original
# logic there is no early exit: if several entries match, the last one wins.
my $target;
opendir(my $base_dh, $base) or die "can't look through $base\n";
while (defined(my $entry = readdir $base_dh)) {
    next if $entry =~ /^\./;    # skip dot files
    my $candidate = $base . $entry;
    $target = $candidate if $entry =~ /\Q$thiscoll\E/ && -d $candidate;
}
closedir $base_dh;

if (!$target) {
    print "Unable to locate Cabaniss folder in $base. \n Sorry!!\n\n";
    sleep(2);
    exit;
}

print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n";
print "Don't open it yet! :-) \n\n";
print ". . . working . . . (be patient) . . . \n\n";

# <collection>_<BB><FF><SSS>: captures collection id, 7-digit item part,
# box, folder and item sequence.  The tif form adds an optional 4-digit page.
my $ITEM_DIR_RE   = qr/^([a-z]\d{4}_\d{7})_((\d{2})(\d{2})(\d{3}))$/;
my $TIF_RE        = qr/^([a-z]\d{4}_\d{7})_((\d{2})(\d{2})(\d{3}))(?:_(\d{4}))?\.tif$/;
# ...Box_N[_label]\Folder_N[_label]\[itemdir\]: captures box, folder, itemdir.
my $BOX_FOLDER_RE = qr/Box_(\d{1,2})(?:_[^\\]*)?\\Folder_(\d{1,2})(?:_[^\\]*)?\\([^\\]+\\)?/i;

my %ItemSeq;         # "BB_FF"   => [ item sequence numbers seen ]
my %pages;           # item id   => [ page numbers seen ]
my %collnumItems;    # collection id => { tif name => [ directories holding it ] }
my (@badform, @errors, @wrongdir);

# Breadth-first walk using an explicit queue (every directory path ends in
# a backslash so that "$dir$file" is a full path).
my @queue = ($target . "\\");
while (@queue) {
    my $dir = shift @queue;

    # Directory path relative to the collection id; used in messages and to
    # derive the box/folder/item directory this location represents.
    (my $checkdir = $dir) =~ s/.*?\Q$thiscoll\E/$thiscoll/;

    # These depend only on the directory, so work them out once per directory.
    my ($dir_box, $dir_folder, $itemdir);
    if ($checkdir =~ $BOX_FOLDER_RE) {
        $dir_box    = sprintf("%02d", $1);    # left pad if needed
        $dir_folder = sprintf("%02d", $2);
        $itemdir    = defined $3 ? $3 : '';
        $itemdir =~ s/\\//g;
    }

    opendir(my $dh, $dir) or die "can't read through $dir directory\n";
  ENTRY:
    while (defined(my $file = readdir $dh)) {
        next ENTRY if $file =~ /^\./ || $file =~ /Admin/;    # dot files, Admin
        my $path = $dir . $file;

        if (-d $path) {
            # An item directory counts as one item of its box/folder.
            if ($file =~ $ITEM_DIR_RE) {
                my ($collnum, $boxnum, $foldernum, $item_sequence) = ($1, $3, $4, $5 + 0);
                push @{ $ItemSeq{ $boxnum . "_" . $foldernum } }, $item_sequence;
                if ($collnum ne $thiscoll) {
                    push @badform, "$file does not match collection number $thiscoll\n";
                }
            }
            push @queue, $path . "\\";    # descend into every directory
        }
        elsif ($file =~ $TIF_RE) {
            my ($collnum, $thisItem, $boxnum, $foldernum, $page) = ($1, $2, $3, $4, $6);
            my $item_sequence = $5 + 0;                       # drops padding zeros
            my $itemID        = $collnum . "_" . $thisItem;
            my $thisPage      = defined $page ? $page + 0 : 0;

            if ($collnum ne $thiscoll) {
                push @badform, "$file does not match collection number $thiscoll\n";
            }

            # A tif without a page suffix is an item in its own right.
            push @{ $ItemSeq{ $boxnum . "_" . $foldernum } }, $item_sequence
                unless $page;

            # Remember every directory in which this file name turns up.
            push @{ $collnumItems{$collnum}{$file} }, $dir;

            if (!defined $dir_box) {
                push @errors, "cannot parse out box and folder from $path\n";
                next ENTRY;
            }
            if ($dir_box ne $boxnum || $dir_folder ne $foldernum) {
                push @badform, "$file name does not reflect Box $dir_box Folder $dir_folder at $path";
            }

            next ENTRY unless $itemdir;    # tif sits directly in the folder
            if (!$thisPage) {
                push @errors, "$file is in $itemdir directory, but has no page number\n";
            }
            elsif ($itemdir ne $itemID) {
                push @wrongdir, "$file is in $itemdir; no match on item id\n";
            }
            else {
                push @{ $pages{$itemdir} }, $thisPage;    # page number without padding
            }
        }
        else {
            next ENTRY if $file =~ /thumbs\.db/i || $path =~ /Admin/;
            push @badform, "$file in $checkdir";
        }
    }
    closedir $dh;
}

# Files of this collection, keyed by name.  The keys are already unique, so
# their count is the number of distinct digital files; duplicates are
# reported separately instead of being subtracted from the count.
my $valid_files = $collnumItems{$thiscoll} || {};
my @allfiles    = sort keys %$valid_files;
my @dupes;
foreach my $file (@allfiles) {
    my @found_in = @{ $valid_files->{$file} };
    next unless @found_in > 1;
    my $collect = join(" \n", @found_in);
    push @dupes, "$file exists in more than one place:\n $collect";
}
my $all = scalar @allfiles;

# Item sequence within every box/folder.
my (@foldercount, @pagecounts, @badcount);
my $totalItems = 0;
foreach my $bf (sort keys %ItemSeq) {
    my ($box, $folder) = split /_/, $bf;
    my @items    = @{ $ItemSeq{$bf} };
    my $numItems = scalar @items;
    push @foldercount, "Box $box Folder $folder contains $numItems items";
    $totalItems += $numItems;
    foreach my $mismatch (sequence_mismatches(@items)) {
        my ($expected, $found) = @$mismatch;
        push @badcount, "In Box $box Folder $folder: expected sequence $expected instead of $found";
    }
}

# Page sequence within every paged item.
foreach my $doc (sort keys %pages) {
    my @doc_pages = @{ $pages{$doc} };
    my $numPages  = scalar @doc_pages;
    push @pagecounts, "$doc has $numPages pages";
    foreach my $mismatch (sequence_mismatches(@doc_pages)) {
        my ($expected, $found) = @$mismatch;
        push @badcount, "$doc page problem: expected sequence $expected instead of $found";
    }
}

# ---- report ---------------------------------------------------------------
print {$out} "\nTotal count of valid content in this upload:\n $totalItems items and $all digital files\n";

if (@badform || @badcount || @wrongdir || @errors || @dupes) {
    print {$out} "\nTROUBLE: \n";
    print {$out} "-------------------------------------------\n";
}
else {
    print {$out} "All is GREAT!! GOOD WORK!! :-) \n";
}

print_section($out,
    "\nThese general errors impeded this script, so everything may NOT have been checked:\n"
  . "Please repair and run this again.\n",
    @errors);
print_section($out,
    "\nThe following files or directories do NOT reflect the name of their parent directory\n",
    @wrongdir);
print_section($out,
    "\nThe following filenames or directories are not in the correct format:\n",
    @badform);
print_section($out,
    "\nThe following folders or items have sequence problems:\n",
    @badcount);
print_section($out,
    "\nThe following files are duplicated:\n",
    @dupes);

print {$out} "\n\nHere's some more info:\nFOLDER COUNTS:\n_________________\n";
print {$out} $_ . "\n" foreach @foldercount;
print {$out} "\n\nPAGE COUNTS:\n_______________________\n";
print {$out} $_ . "\n" foreach @pagecounts;

# Buffered write errors only surface at close, so check it.
close($out) or die "can't finish writing $output: $!\n";
print "Good bye!\n";
exit;

# Numeric ascending comparator for sort.
sub by_number { $a <=> $b; }

# Sorts the given numbers and compares them with the run 1..N.
# Returns one [expected, found] pair for every position that differs
# (so a single gap is reported for each later position as well).
sub sequence_mismatches {
    my @sorted = sort by_number @_;
    my @mismatches;
    foreach my $expected (1 .. scalar @sorted) {
        my $found = $sorted[ $expected - 1 ];
        push @mismatches, [ $expected, $found ] if $found != $expected;
    }
    return @mismatches;
}

# Writes $heading followed by one indented line per message to $fh;
# writes nothing at all when there are no messages.
sub print_section {
    my ($fh, $heading, @messages) = @_;
    return unless @messages;
    print {$fh} $heading;
    print {$fh} " $_\n" foreach @messages;
    return;
}

# Returns the current local time as YYYYMMDDTHHMMSS, for use in the report
# file name.  Built from localtime rather than the output of the shell's
# date/time commands, so it needs no key presses and does not depend on the
# operating system or the regional date format.
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime;
    return sprintf("%04d%02d%02dT%02d%02d%02d",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}