#!C:\Perl64\bin\perl.exe use Time::Local; # filenamesAndTranscripts # locate all the bad filenames # things in the wrong directory # if it is not a tiff, # checks to ensure that there is a tiff for each -- if not in same directory, then in # one of the scans directories # checks to make sure we don't have BOTH a .txt AND a .ocr.txt for the file # OK to have tiff and txt or ocr.txt # if Transcripts contains both tiffs and txt or ocr.txt files, then must have a one-to-one # correspondence between them. # # jody DeRidder, 4/23/10 ×tamp; #print $timestamp."\n"; $output = "..\\output\\BadFileNames".$timestamp.".txt"; open (OUT, ">".$output) or die "can't open $output\n"; print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n"; print "Don't open it yet! :-) \n\n"; @bases = ("S:\\Digital\ Projects\\Digital_Coll_in_progress\\", "S:\\Digital\ Projects\\Digital_Coll_Complete\\"); print "Which directory do you want? Type the number, then press enter:\n"; $basecount = scalar @bases; for ($i = 1; $i <= $basecount; $i ++){ print $i.") ".$bases[$i-1]."\n"; } $num = ; chop $num; $num --; $base = $bases[$num]; opendir(BASE, $base) or die "can't open $base\n"; while ($file = readdir(BASE)){ # print "looking at $file\n"; if ($file =~ /^\./){next;} # skip dot files $path = $base.$file; if (-d $path){ push (@dirs, $file); } } close(BASE); print "I'm looking into $base. \n Here are the directories there. \n"; $dircount = scalar @dirs; for ($i = 1; $i <= $dircount; $i ++){ if ($dirs[$i-1] =~ /\w/){ print $i.") ".$dirs[$i-1]."\n"; } } #foreach (@dirs){ print "$_\n";} print "\nWhich directory do you want? \n Type the number -- then press enter:\n"; $num = ; chop $num; if ($num =~ /^(\d+)/){ $num = $1; $herePlease = 1; } $num --; $mybase = $base.$dirs[$num]."\\"; if ($mybase =~ /[a-z]{1}[\d]{4}\_[\d]{7}/){ $thisdir = $mybase; } else{ opendir(THIS, $mybase) or die "can't open $mybase\n"; undef @dirs; while ($file = readdir(THIS)){ if ($file =~ /^\./){next;} # skip dot files $path = $mybase.$file; if (-d $path){ push (@dirs, $file); } } close(THIS); $dircount = scalar @dirs; if ($dircount == 0){ print "\nno directories found here!\n Please start over\n\n"; exit;} # break the loop else{ print " Here are the directories there. \n"; for ($i = 1; $i <= $dircount; $i ++){ if ($dirs[$i-1] =~ /\w/){ print $i.") ".$dirs[$i-1]."\n"; } } print "\nWhich directory do you want? \n Type the number \n" ; print "-- then press enter:\n"; $num = ; if ($num =~ /^(\d+)\!/){ $num = $1; } $num --; $mybase .= $dirs[$num]."\\"; $thisdir = $mybase; } } print "Files will be tested in $thisdir\n"; print "If that is incorrect, please hit Control C to exit\n"; #print "parentDir is $parentDir\n"; # here we collect all the scans directories so we can look for duplicates. opendir(MOM, $thisdir) or die "can't open $thisdir\n"; while ($file = readdir(MOM)){ if ($file =~ /^\./){ next; } # skip dot files if ($file =~ /scans/i){ $tryme = $thisdir."\\".$file; if (-d $tryme){ # if it's a scans directory, collect it # print "collecting $tryme for testing\n"; push (@allScanDirs, $tryme); } } } # test here to see if this directory has filename in it... # if not, ask for file name structure to test undef $childbase; undef $coll; if ($thisdir =~ /\\([a-z]{1}\d{4}\_\d{7}(\_\d{7})?)/){ $coll = $1; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; while ($answer =~ /n/i){ undef $coll; print "Please enter the collection identifier\n"; $coll = ; chop $coll; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; } } if (!$coll){ print "Please enter the collection identifier\n"; $coll = ; chop $base; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; while ($answer =~ /n/i){ undef $coll; print "Please enter the collection identifier\n"; $coll = ; chop $coll; print "Is this the collection identifier? -->$coll\n"; print "Y or N, and press enter\n"; $answer = ; } } # this tells us whether to hunt for child files for which the item has an image already print "\nDoes this collection consist solely of scrapbook-type items?\n"; print "Y or N, and press enter\n"; $answer = ; if ($answer =~ /y/i){ $scrapbooks = 1;} if (@allScanDirs){ print "\nFirst we will look for duplicates across all scans directories found \n\n"; print ". . . working . . . (be patient) . . . \n\n"; foreach $dir (@allScanDirs){ opendir(DIR, $dir) or die "can't open $dir\n"; while ($file = readdir(DIR)){ if ($file =~ /^\./){ next; } # skip dot files if ($file =~ /\.tif/){ # got one. Parse it ($fileID = $file) =~ s,\.tiff?$,,; # take off the extension #print "fileID for $file is $fileID\n"; undef $pageID; undef $subpageID; if ($fileID =~ /^([^\_]*?\_[^\_]*?\_[^\_]*?)((\_[^\_]*?)(\_[^\_]*?)?)?$/){ # print "found $fileID\n"; $itemID = $1; if ($2){ $pageID = $1.$3 } if ($4){ $subpageID = $fileID;} push (@{$loc{$fileID}}, $dir); # create hash array keyed on fileID, with array of directories where found if (!$scrapbooks){ # also collect itemkids and pagekids keyed on item and page, for checking if ($subpageID){ push (@{$itemkids{$itemID}}, $subpageID); push (@{$itemkids{$pageID}}, $subpageID); } elsif ($pageID){ push (@{$itemkids{$itemID}}, $pageID); # collect page } } } else{ print OUT "ERROR: $file in $dir is NOT an item, or has underscore issues\n";} } else{ $path = $dir."\\".$file; if (-d $path){ push (@allScanDirs, $path);} # if a directory, keep going down } } close (DIR); } # look for more than one directory for any file first @fileIDs = keys (%loc); foreach $id (@fileIDs){ @myloc = @{$loc{$id}}; if ($myloc[1]){ if (!$header){ print OUT "\nDUPLICATES FOUND\n_____________________\n\n"; $header = 1; } print OUT "$id found in multiple directories:\n"; foreach $l (@myloc){ print OUT "\t$l\n";} } if ($itemkids{$id}){ if (!$header){ print OUT "\nDUPLICATES FOUND\n_____________________\n\n"; $header = 1; } print OUT "$id has images at multiple levels:\n"; @myloc = @{$loc{$id}}; foreach $l (@myloc){ print OUT "\t$id: $l\n";} @mykids = @{$itemkids{$id}}; foreach $kid (@mykids){ @myloc = @{$loc{$kid}}; foreach $l (@myloc){ print OUT "\t$kid: $l\n";} } } } } else { print "no scans directories found in directory above the one you selected. No Duplicate Checking done!!!\n\n";} # now check for items at page level @itemList = sort keys (%itemkids); foreach $itemID (@itemList){ @mykids = @{$itemkids{$itemID}}; $numkids = scalar(@mykids); if ($numkids == 1){ # if only one child to the item, check the numbering $child = $mykids[0]; if ($child =~ /^[^\_]*?\_[^\_]*?\_[^\_]*?\_[^\_]*?$/ || $child =~ /^[^\_]*?\_[^\_]*?\_[^\_]*?\_[^\_]*?\_[^\_]*?$/){ push (@badkids, $itemID); } } } print "\nWe will test the files against collection identifier $coll.\n\n"; print ". . . working . . . (be patient) . . . \n\n"; push (@mydirs, $mybase."\\"); # start with home base foreach $dir (@mydirs){ $count = 1; # checking sequence count of files in directories opendir (DIR, $dir) or die "can't read files in $dir\n"; while ($adir = readdir(DIR)){ if ($adir =~ /^\./ || $adir =~ /Thumbs\.db/i){next;} #skip dot files and thumbs.db $thispath = $dir."\\".$adir; if ($adir =~ /scans/i && (-d $thispath)){ push (@mydirs, $thispath); next; print "$thispath is a directory\n"; } # print "looking at $adir\n"; ($short = $dir) =~ s,S\:\\Digital Projects\\,,; $short =~ s,Digital_Coll_in_progress\\,,; $short =~ s,Digital_Coll_Complete\\,,; ($parent = $short) =~ s,.*\\,,; #($parent = $short) =~ s,.*\\(\w)\\\$,\1,; # parent directory is the value before the last slash # we want to match for this if ($adir =~ /Scans/i){ push (@mydirs, $thispath); next; } elsif ($adir =~ /Transcripts/i){ push (@transdirs, $thispath); } elsif ($adir =~ /Admin/){ opendir(ADMIN, $thispath) or die "can't look through $thispath\n"; undef $found; while ($afile = readdir(ADMIN)){ if ($afile =~ /^\./){ next;} if ($afile =~ /^$coll\.xml$/){ $found = 1; } elsif ($afile =~ /^$coll\.\d{1,2}\.xml$/){ $found = 1; } } if (!$found){ push (@missing, "$coll.xml missing from Admin folder\n");} close(ADMIN); } elsif ($adir =~ /Metadata/){ opendir(MD, $thispath) or die "can't look through $thispath\n"; undef $found; while ($afile = readdir(MD)){ if ($afile =~ /^\./){ next;} if ($afile =~ /^$coll\.txt$/){ $found = 1; } elsif ($afile =~ /^$coll\.\d{1,2}\.txt$/){ $found = 1; } } # if (!$found){ push (@missing, "$coll.txt missing from Metadata folder\n");} close(MD); } # here, going down into scans directories elsif ($adir =~ /$coll/){ # passes first test, it matches collname # first, capture entire filename if a tiff or mp3 if ($adir =~ /^(.*?)\.tif/i || $adir =~ /^(.*?)\.mp3/i){ push (@scanNums, $1); } ($testpath = $thispath) =~ s,\\$,,; # print "testing $testpath to see if it's a directory, for $thispath\n"; if ( -d $testpath){ #print "FOUND a directory: $testpath\n"; if (!( $adir =~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?$/ )){ # print "-->$adir<-- directory does not match expected pattern for a directory\n"; push (@badform, $adir); } if ($adir =~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}$/){ push (@items, $adir); } push (@mydirs, $thispath); # collect subdirectories for further investigation } else{ # not a directory; must be a file. Does it match its parent directory? if ((!($parent =~ /Scans/i)) && (!($adir =~ /$parent/))){ push (@wrongdir, $adir." in ".$short); if ($adir =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\..*/){ # leave off extension push (@items, $adir); } } if (!( $adir =~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?\.[\w]{2,4}$/)){ # print "-->$adir<-- directory does not match expected pattern for a file\n"; push (@badform, $adir); } # pull off the last set of numbers before the extension elsif ($parent =~ /Scans/i || (!$parent)){ if($adir =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\..*/){ # leave off extension push (@items, $adir); } } elsif ($adir =~ /.*\_(\d{3,7})\./) { # check for bad numbering ONLY on files named correctly $mynum = $1 + 0; # get rid of leading zeros if ($mynum != $count){ push (@badcount, $adir." ".$count); } $count ++; if ($adir =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\..*/){ # leave off extension push (@items, $adir); } } } } else { push (@wrongdir, $adir." ".$short);} } close(DIR); } # we have @scanNums containing filenames of all tiffs in scans directories; # we have transcript directories in @transdirs # our task now is to collect all the filenames here, but pay attention to extensions # locate all the bad filenames # things in the wrong directory # if it is not a tiff, # checks to ensure that there is a tiff for each -- if not in same directory, then in # one of the scans directories # checks to make sure we don't have BOTH a .txt AND a .ocr.txt for the file # OK to have tiff and txt or ocr.txt # if Transcripts contains both tiffs and txt or ocr.txt files, then must have a one-to-one # correspondence between them. undef @ocr; undef @txt; undef @transtifs; # print "now looking at transcript directories\n"; foreach $dir (@transdirs){ opendir (DIR, $dir) or die "can't look through $dir\n"; while ($adir = readdir(DIR)){ $path = $dir.$adir; if ($adir =~ /^\./){ next;} # no dot files # print "looking at $adir\n"; # first, capture entire filename if a tiff, ocr, or corrected txt if ($adir =~ /^(.*?)\.tif/i){ push (@transtifs, $1); } elsif ($adir =~ /^(.*?)\.ocr\.txt/){ push (@ocr, $1); } elsif ($adir =~ /^(.*?)\.txt/){ push (@txt, $1); } elsif (-d $path){ push (@transdirs, $dir.$adir."\\");} # keep going down } close(DIR); } # first, let's see if we have both txt and .ocr.txt for anything undef @multiples; undef @notif; foreach $o (@ocr){ undef $found; foreach $t (@txt){ if ($o eq $t){ push (@multiples, $o);} } # while we're at it, let's look for tiffs for the OCR foreach $tif (@scanNums){ if ($o eq $tif){ $found = 1;} } foreach $tif (@transtifs){ if ($o eq $tif){ # the following for when it is ready for deposit. if ($found == 1){ print OUT "ERROR! $tif exists as a tiff both in transcripts AND in scans!!\n";} $found = 1; } } if (!$found){ push (@notif, "OCR: $o");} } # now let's look for tiffs for the txt files foreach $t (@txt){ undef $found; foreach $tif (@scanNums){ if ($t eq $tif){ $found = 1;} } foreach $tif (@transtifs){ if ($t eq $tif){ # the following for when it is ready for deposit. if ($found == 1){ print OUT "ERROR! $tif exists as a tiff both in transcripts AND in scans!!\n";} $found = 1; } } if (!$found){ push (@notif, "TXT: $t");} } # finally, let's check to see if all transcript tiffs have corresponding scan tiffs foreach $tt (@transtifs){ undef $found; foreach $st (@scanNums){ if ($tt eq $st){ $found = 1;} } if (!$found){ push (@notif, "Transcript TIFF: $tt");} } print OUT "\n\nAnything in bad form was NOT counted as an item,\n though it WAS counted if in the wrong directory\n\n"; foreach $item (@items){ if ($item =~ /\_([\d]{7})(\.\w{2,5})?$/){ $itemnum = $1 + 0; # from text to number, discard padding zeros if (! $num{$itemnum}){ $num{$itemnum} = $item;} else{ push (@dupelist, $item);} } else{ print "ERROR: unable to determine item number for $item\n";} } #if (@dupelist){ # print OUT "\n\nThese items or item directories are duplicated in the file system:\n"; # foreach (@dupelist){ print OUT "$_\n";} # print OUT "\n\n"; # } @sorted = sort by_number (keys(%num)); print OUT "I counted ".scalar(@sorted)." distinct items total\n"; $last = $sorted[0] -1; foreach $number (@sorted){ $gap = $number - $last; if ($gap > 1){ $gapBeforeMe{$number} = $gap -1; } # one file missing if gap == 2 $last = $number; } if (%gapBeforeMe){ print OUT "Item number: \t Number of items missing before it:\n"; @gaplist = sort by_number keys (%gapBeforeMe); foreach $number (@gaplist){ print OUT $num{$number}." \t ".$gapBeforeMe{$number}."\n"; } print OUT "\n\n"; } if (@badform || @badcount || @wrongdir || @missing || @multiples || @notif || @badkids){ print OUT "\nTROUBLE: $coll \n"; print OUT "-------------------------------------------\n"; print "Please check the output of this script in $output \n"; } else{ print OUT "All is GREAT!! GOOD WORK!! :-) \n"; } if (@wrongdir){ print OUT "\nThe following files or directories do NOT reflect the name of their parent directory\n"; print OUT "Are they in the right place? Please check:\n"; foreach (@wrongdir){ print OUT " $_\n";} } if (@badform){ print OUT "\nThe following filenames or directories are not in the correct format:\n"; foreach (@badform){ print OUT " $_\n";} } if (@badkids){ print OUT "\nThe following items or pages have a single image with too many segments in the file name\n"; print OUT "(such as an item at page level!!! Don't do that!!)\n"; foreach (@badkids){ print OUT " $_\n";} } if (@badcount){ print OUT "\nThe following filenames were expected to be the sequence number\n"; print OUT "that follows them:\n"; foreach (@badcount){ print OUT " $_\n";} } if (@missing){ print OUT "\nThe following files are missing or badly named \n"; foreach (@missing){ print OUT " $_\n";} } if (@multiples){ print OUT "\nYou have both OCR and corrected text for the following:\n"; foreach (@multiples){ print OUT " $_\n";} } if (@notif){ print OUT "\nYou have no matching tiff for each of the following:\n"; foreach (@notif){ print OUT " $_\n";} } close OUT; print "Good bye!\n"; print "You can close this window now.\n"; sleep(5); exit; sub timestamp{ print "hit enter twice please\n"; $date = `date`; $time = `time`; #print $date."\n"; if ($date =~ /.*? (\d*)\/(\d*)\/(\d*)/){ $date = $3.$1.$2; } #print $date."\n"; if ($time =~ /.*? (\d*)\:(\d*)\:(\d*)\./){ $time = $1.$2.$3; } #print $time."\n"; $timestamp = $date."T".$time; #print $timestamp."\n"; # following for unix #($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = gmtime($mydate); #$mon ++; #if ($mon < 10){ $mon="0".$mon;} #need 2 digits #if ($sec < 10){ $sec="0".$sec;} #if ($min < 10){ $min="0".$min;} #if ($hour < 10){ $hour="0".$hour;} #if ($mday < 10){ $mday="0".$mday;} #$year = $year + 1900; #$timestamp= "$year-$mon-$mday\T$hour:$min:$sec\Z"; } sub by_number {$a <=> $b;}