#!C:\Perl64\bin\perl.exe use Time::Local; # filenamesAndTranscripts # locate all the bad filenames # things in the wrong directory # if it is not a tiff, # checks to ensure that there is a tiff for each -- if not in same directory, then in # one of the scans directories # checks to make sure we don't have BOTH a .txt AND a .ocr.txt for the file # OK to have tiff and txt or ocr.txt # if Transcripts contains both tiffs and txt or ocr.txt files, then must have a one-to-one # correspondence between them. # # jody DeRidder, 4/23/10 ##Copyright (c) 2010, The University of Alabama Libraries. ## Contributed by Jody DeRidder, 5/10/10. ##All rights reserved. ##Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: ## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. ## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the ## distribution. ## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. ##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR ##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ×tamp; #print $timestamp."\n"; $output = "..\\output\\BadFileNames".$timestamp.".txt"; open (OUT, ">".$output) or die "can't open $output\n"; print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n"; print "Don't open it yet! :-) \n\n"; @bases = ("S:\\Digital\ Projects\\Digital_Coll_in_progress\\", "S:\\Digital\ Projects\\Digital_Coll_Complete\\"); print "Which directory do you want? Type the number, then press enter:\n"; $basecount = scalar @bases; for ($i = 1; $i <= $basecount; $i ++){ print $i.") ".$bases[$i-1]."\n"; } $num = ; chop $num; $num --; $base = $bases[$num]; opendir(BASE, $base) or die "can't open $base\n"; while ($file = readdir(BASE)){ # print "looking at $file\n"; if ($file =~ /^\./){next;} # skip dot files $path = $base.$file; if (-d $path){ push (@dirs, $file); } } close(BASE); print "I'm looking into $base. \n Here are the directories there. \n"; $dircount = scalar @dirs; for ($i = 1; $i <= $dircount; $i ++){ print $i.") ".$dirs[$i-1]."\n"; } #foreach (@dirs){ print "$_\n";} print "\nWhich collection directory do you want? \n Type the number... if this is the \n"; print "final destination directory, type ! also \n -- then press enter:\n"; $num = ; chop $num; if ($num =~ /^(\d+)\!/){ $num = $1; $herePlease = 1; } $num --; $mybase = $base.$dirs[$num]."\\"; $thisdir = $mybase; if (! $herePlease){ $myval = 1; while ($myval eq 1){ opendir(THIS, $thisdir) or die "can't open $thisdir\n"; undef @dirs; while ($file = readdir(THIS)){ if ($file =~ /^\./){next;} # skip dot files $path = $thisdir.$file; if (-d $path){ push (@dirs, $file); } } close(THIS); $dircount = scalar @dirs; if ($dircount == 0){ print "There are no directories below this. Shall I test files here?\n"; print "Y or N, and press enter:\n"; $answer = ; if ($answer =~ /n/i){ print "Bye Bye then!\n"; exit;} last;# break the loop } print "I'm testing $thisdir. \n"; print "\nDid we go down one directory too far?\n"; print "Y or N, and press enter\n"; $answer = ; if ($answer =~ /y/i){ $thisdir =~ s,^(.*\\).*?\\$,\1,; opendir(THIS, $thisdir) or die "can't open $thisdir\n"; undef @dirs; while ($file = readdir(THIS)){ if ($file =~ /^\./){next;} # skip dot files $path = $thisdir.$file; if (-d $path){ push (@dirs, $file); } } close(THIS); $dircount = scalar @dirs; } print " Here are the directories there. \n"; for ($i = 1; $i <= $dircount; $i ++){ print $i.") ".$dirs[$i-1]."\n"; } print "\nWhich directory do you want? \n Type the number \n AND if this is the "; print "destination directory, type ! \n -- then press enter:\n"; $num = ; if ($num =~ /^(\d+)\!/){ $num = $1; $myval = 2; #break the loop } $num --; $thisdir.= $dirs[$num]."\\"; } } print "Files will be tested in $thisdir\n"; print "\nDid we go down one directory too far?\n"; print "Y or N, and press enter\n"; $answer = ; if ($answer =~ /y/i){ $thisdir =~ s,^(.*\\).*?\\$,\1,; $mybase = $thisdir; } else{ $mybase = $thisdir;} print "Files will be tested in $mybase\n"; # test here to see if this directory has filename in it... # if not, ask for file name structure to test undef $childbase; undef $coll; if ($thisdir =~ /\\([a-z]{1}\d{4}\_\d{7}(\_\d{7})?)/){ $coll = $1; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; while ($answer =~ /n/i){ undef $coll; print "Please enter the collection identifier\n"; $coll = ; chop $coll; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; } } if (!$coll){ print "Please enter the collection identifier\n"; $coll = ; chop $base; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; while ($answer =~ /n/i){ undef $coll; print "Please enter the collection identifier\n"; $coll = ; chop $coll; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; } } print "\nWe will test the files against collection identifier $coll.\n\n"; print ". . . working . . . (be patient) . . . \n\n"; push (@mydirs, $mybase."\\"); # start with home base foreach $dir (@mydirs){ $count = 1; # checking sequence count of files in directories opendir (DIR, $dir) or die "can't read files in $dir\n"; while ($adir = readdir(DIR)){ if ($adir =~ /^\./ || $adir =~ /Thumbs\.db/i){next;} #skip dot files and thumbs.db $thispath = $dir."\\".$adir; if ($adir =~ /scans/i && (-d $thispath)){ push (@mydirs, $thispath); next; print "$thispath is a directory\n"; } # print "looking at $adir\n"; ($short = $dir) =~ s,S\:\\Digital Projects\\,,; $short =~ s,Digital_Coll_in_progress\\,,; $short =~ s,Digital_Coll_Complete\\,,; ($parent = $short) =~ s,.*\\,,; #($parent = $short) =~ s,.*\\(\w)\\\$,\1,; # parent directory is the value before the last slash # we want to match for this if ($adir =~ /Scans/i){ push (@mydirs, $thispath); next; } elsif ($adir =~ /Transcripts/i){ push (@transdirs, $thispath); } elsif ($adir =~ /Admin/){ opendir(ADMIN, $thispath) or die "can't look through $thispath\n"; undef $found; while ($afile = readdir(ADMIN)){ if ($afile =~ /^\./){ next;} if ($afile =~ /^$coll\.xml$/){ $found = 1; } elsif ($afile =~ /^$coll\.\d{1,2}\.xml$/){ $found = 1; } } if (!$found){ push (@missing, "$coll.xml missing from Admin folder\n");} close(ADMIN); } elsif ($adir =~ /Metadata/){ opendir(MD, $thispath) or die "can't look through $thispath\n"; undef $found; while ($afile = readdir(MD)){ if ($afile =~ /^\./){ next;} if ($afile =~ /^$coll\.txt$/){ $found = 1; } elsif ($afile =~ /^$coll\.\d{1,2}\.txt$/){ $found = 1; } } if (!$found){ push (@missing, "$coll.txt missing from Metadata folder\n");} close(MD); } # here, going down into scans directories elsif ($adir =~ /$coll/){ # passes first test, it matches collname # first, capture entire filename if a tiff or mp3 if ($adir =~ /^(.*?)\.tif/i || $adir =~ /^(.*?)\.mp3/i){ push (@scanNums, $1); } ($testpath = $thispath) =~ s,\\$,,; # print "testing $testpath to see if it's a directory, for $thispath\n"; if ( -d $testpath){ #print "FOUND a directory: $testpath\n"; if (!( $adir =~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?$/ )){ # print "-->$adir<-- directory does not match expected pattern for a directory\n"; push (@badform, $adir); } if ($adir =~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}$/){ push (@items, $adir); } push (@mydirs, $thispath); # collect subdirectories for further investigation } else{ # not a directory; must be a file. Does it match its parent directory? if ((!($parent =~ /Scans/i)) && (!($adir =~ /$parent/))){ push (@wrongdir, $adir." in ".$short); if ($adir =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\..*/){ # leave off extension push (@items, $adir); } } if (!( $adir =~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?\.[\w]{2,4}$/)){ # print "-->$adir<-- directory does not match expected pattern for a file\n"; push (@badform, $adir); } # pull off the last set of numbers before the extension elsif ($parent =~ /Scans/i || (!$parent)){ if($adir =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\..*/){ # leave off extension push (@items, $adir); } } elsif ($adir =~ /.*\_(\d{3,7})\./) { # check for bad numbering ONLY on files named correctly $mynum = $1 + 0; # get rid of leading zeros if ($mynum != $count){ push (@badcount, $adir." ".$count); } $count ++; if ($adir =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\..*/){ # leave off extension push (@items, $adir); } } } } else { push (@wrongdir, $adir." ".$short);} } close(DIR); } # we have @scanNums containing filenames of all tiffs in scans directories; # we have transcript directories in @transdirs # our task now is to collect all the filenames here, but pay attention to extensions # locate all the bad filenames # things in the wrong directory # if it is not a tiff, # checks to ensure that there is a tiff for each -- if not in same directory, then in # one of the scans directories # checks to make sure we don't have BOTH a .txt AND a .ocr.txt for the file # OK to have tiff and txt or ocr.txt # if Transcripts contains both tiffs and txt or ocr.txt files, then must have a one-to-one # correspondence between them. undef @ocr; undef @txt; undef @transtifs; # print "now looking at transcript directories\n"; foreach $dir (@transdirs){ opendir (DIR, $dir) or die "can't look through $dir\n"; while ($adir = readdir(DIR)){ $path = $dir.$adir; if ($adir =~ /^\./){ next;} # no dot files # print "looking at $adir\n"; # first, capture entire filename if a tiff, ocr, or corrected txt if ($adir =~ /^(.*?)\.tif/i){ push (@transtifs, $1); } elsif ($adir =~ /^(.*?)\.ocr\.txt/){ push (@ocr, $1); } elsif ($adir =~ /^(.*?)\.txt/){ push (@txt, $1); } elsif (-d $path){ push (@transdirs, $dir.$adir."\\");} # keep going down } close(DIR); } # first, let's see if we have both txt and .ocr.txt for anything undef @multiples; undef @notif; foreach $o (@ocr){ undef $found; foreach $t (@txt){ if ($o eq $t){ push (@multiples, $o);} } # while we're at it, let's look for tiffs for the OCR foreach $tif (@scanNums){ if ($o eq $tif){ $found = 1;} } foreach $tif (@transtifs){ if ($o eq $tif){ # the following for when it is ready for deposit. if ($found == 1){ print OUT "ERROR! $tif exists as a tiff both in transcripts AND in scans!!\n";} $found = 1; } } if (!$found){ push (@notif, "OCR: $o");} } # now let's look for tiffs for the txt files foreach $t (@txt){ undef $found; foreach $tif (@scanNums){ if ($t eq $tif){ $found = 1;} } foreach $tif (@transtifs){ if ($t eq $tif){ # the following for when it is ready for deposit. if ($found == 1){ print OUT "ERROR! $tif exists as a tiff both in transcripts AND in scans!!\n";} $found = 1; } } if (!$found){ push (@notif, "TXT: $t");} } # finally, let's check to see if all transcript tiffs have corresponding scan tiffs foreach $tt (@transtifs){ undef $found; foreach $st (@scanNums){ if ($tt eq $st){ $found = 1;} } if (!$found){ push (@notif, "Transcript TIFF: $tt");} } print OUT "Anything in bad form was NOT counted as an item,\n though it WAS counted if in the wrong directory\n\n"; foreach $item (@items){ if ($item =~ /\_([\d]{7})(\.\w{2,5})?$/){ $itemnum = $1 + 0; # from text to number, discard padding zeros if (! $num{$itemnum}){ $num{$itemnum} = $item;} else{ push (@dupelist, $item);} } else{ print "ERROR: unable to determine item number for $item\n";} } if (@dupelist){ print OUT "\n\nThese items or item directories are duplicated in the file system:\n"; foreach (@dupelist){ print OUT "_\n";} print OUT "\n\n"; } @sorted = sort by_number (keys(%num)); print OUT "I counted ".scalar(@sorted)." distinct items total\n"; $last = $sorted[0] -1; foreach $number (@sorted){ $gap = $number - $last; if ($gap > 1){ $gapBeforeMe{$number} = $gap -1; } # one file missing if gap == 2 $last = $number; } if (%gapBeforeMe){ print OUT "Item number: \t Number of items missing before it:\n"; @gaplist = sort by_number keys (%gapBeforeMe); foreach $number (@gaplist){ print OUT $num{$number}." \t ".$gapBeforeMe{$number}."\n"; } print OUT "\n\n"; } if (@badform || @badcount || @wrongdir || @missing || @multiples || @notif){ print OUT "\nTROUBLE: $coll \n"; print OUT "-------------------------------------------\n"; print "Please check the output of this script in $output \n"; } else{ print OUT "All is GREAT!! GOOD WORK!! :-) \n"; } if (@wrongdir){ print OUT "\nThe following files or directories do NOT reflect the name of their parent directory\n"; print OUT "Are they in the right place? Please check:\n"; foreach (@wrongdir){ print OUT " $_\n";} } if (@badform){ print OUT "\nThe following filenames or directories are not in the correct format:\n"; foreach (@badform){ print OUT " $_\n";} } if (@badcount){ print OUT "\nThe following filenames were expected to be the sequence number\n"; print OUT "that follows them:\n"; foreach (@badcount){ print OUT " $_\n";} } if (@missing){ print OUT "\nThe following files are missing or badly named \n"; foreach (@missing){ print OUT " $_\n";} } if (@multiples){ print OUT "\nYou have both OCR and corrected text for the following:\n"; foreach (@multiples){ print OUT " $_\n";} } if (@notif){ print OUT "\nYou have no matching tiff for each of the following:\n"; foreach (@notif){ print OUT " $_\n";} } close OUT; print "Good bye!\n"; sleep(5); exit; sub timestamp{ print "hit enter twice please\n"; $date = `date`; $time = `time`; #print $date."\n"; if ($date =~ /.*? (\d*)\/(\d*)\/(\d*)/){ $date = $3.$1.$2; } #print $date."\n"; if ($time =~ /.*? (\d*)\:(\d*)\:(\d*)\./){ $time = $1.$2.$3; } #print $time."\n"; $timestamp = $date."T".$time; #print $timestamp."\n"; # following for unix #($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = gmtime($mydate); #$mon ++; #if ($mon < 10){ $mon="0".$mon;} #need 2 digits #if ($sec < 10){ $sec="0".$sec;} #if ($min < 10){ $min="0".$min;} #if ($hour < 10){ $hour="0".$hour;} #if ($mday < 10){ $mday="0".$mday;} #$year = $year + 1900; #$timestamp= "$year-$mon-$mday\T$hour:$min:$sec\Z"; } sub by_number {$a <=> $b;}