#!/usr/bin/perl
use strict;
use warnings;

use File::Copy;     # carried over from the original header; not called directly below
use Time::Local;    # carried over from the original header; not called directly below

# moreTranscriptOCR
# jody DeRidder, 6/9/10

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 6/10/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# should ocr not exist for existing transcripts in the
# archive, acumen, or the deposits directory
# this script will provide the option to create OCR for all of
# the transcripts for any collection indicated, or
# for only the content provided in an ocrList.txt file
#  -- which it will look for in the completed directory on the share drive
# ocr files will be deposited directly into Acumen, and will NOT be
# made for anything besides transcript images, and will NOT
# be made if an existing ocr or txt file exists in acumen in a transcripts
# directory, for the image.

# Package global filled in by timestamp(); used to name the log file.
our $timestamp;
timestamp();

# location hardcoded here:
my $inbase      = "/cifs-mount/Digital_Coll_Complete/";
my $acumen      = "/srv/www/htdocs/content/";
my $archive     = "/srv/archive/";
my $depositArea = "/srv/deposits/content/";

my $collnum;     # collection number, when the whole collection is requested
my @ocrMe;       # item numbers read from ocrList.txt
my @collList;    # distinct collections mentioned in ocrList.txt
my @skipped;     # log lines produced before the log file is open

print "\n\nDo you want to create OCR for all the transcripts in a particular collection?\n\n";
if (prompt() =~ /Y/i) {
    $collnum = ask_collection_number();
}
else {
    print "\n\nI\'m Looking for an ocrList.txt file in the Completed folder.\n Please help me locate it.\n\n";
    my $ocrlist = locate_ocr_list($inbase);

    if (!defined $ocrlist || !-e $ocrlist) {
        my $shown = defined $ocrlist ? $ocrlist : 'ocrList.txt';
        print "ERROR! I did not find $shown. Please start over.\n\n";
        exit;
    }

    # Keep re-reading the list until it yields at least one item to OCR.
    while (1) {
        my ($items, $collections, $notes) = read_ocr_list($ocrlist);
        @skipped = @$notes;
        if (@$items) {
            @ocrMe    = @$items;
            @collList = @$collections;
            last;
        }
        print "ERROR! I could not locate any items in ocrList.txt to OCR\n";
        print "Please check file names in the list, and encoding of the file\n";
        print "ANSI text file is best. I can't read UTF-16.\n\n";
        print "When you have replaced the file, please hit enter\n";
        prompt();
    }
}

# at this point we either have a list in @ocrMe or we have a collection number to OCR
# output file will be based on collection number and timestamp
my $output = "/home/jeremiah/AUDIO/output/TranscriptOCRmaker_" . $timestamp . ".txt";
my $out    = "AUDIO/output/TranscriptOCRmaker_" . $timestamp . ".txt";
open(my $log, '>', $output) or die "can't open $output\n";

# call another process to make the ocr so it won't stop when the ssh window closes.
print "\n\nWe are starting to make the OCR now\n";
my $pid = fork();
die "can't fork: $!\n" unless defined $pid;

if ($pid == 0) {
    # the child process does this stuff
    $SIG{HUP} = 'IGNORE';                  # keep going after the terminal goes away
    select((select($log), $| = 1)[0]);     # unbuffered, so progress shows up in the log

    # Written here rather than before the fork, so the buffered text is not
    # flushed twice (once by each process).
    print {$log} @skipped;

    make_ocr(
        log         => $log,
        acumen      => $acumen,
        archive     => $archive,
        depositArea => $depositArea,
        collnum     => $collnum,
        items       => \@ocrMe,
        collections => \@collList,
    );

    print {$log} "OCR creation is complete. Please check Acumen.\n";
    print {$log} "\n\nGREAT WORK!!! :-)\n\n";
    close($log);
    exit;
}

# parent process gives a little more info, then exits
close($log);
print "Please check back in a few hours\n";
print "Any problems found will be in the \n$out file.\n";
print "\n\nThank you!!\n\n";
exit;

# Read one line from STDIN and return it without its line ending.
# Dies when STDIN is exhausted so the prompt loops cannot spin forever.
sub prompt {
    my $reply = <STDIN>;
    die "No more input on STDIN; giving up.\n" unless defined $reply;
    chomp $reply;
    return $reply;
}

# Ask for a collection number until the user supplies a non-empty value and
# does not answer "n" to the confirmation. Returns the number as typed
# (surrounding whitespace removed).
sub ask_collection_number {
    while (1) {
        print "\n\nPlease enter the collection number:\n\n";
        my $number = prompt();
        $number =~ s/^\s+|\s+$//g;

        # An empty number would later resolve to the Acumen root itself.
        next unless length $number;

        print "\n\nIs this the correct number? -->$number<-- \n\n";
        return $number unless prompt() =~ /n/i;
    }
}

# Interactively walk down from $base until the user confirms the directory
# holding ocrList.txt. Returns the full path of the list file, or undef if the
# user rejected every directory offered.
sub locate_ocr_list {
    my ($base) = @_;

    opendir(my $base_dh, $base) or die "can't open $base\n";
    my @dirs;
    for my $name (sort readdir($base_dh)) {
        next if $name =~ /^\./;    # skip dot files
        my $path = $base . $name;
        push @dirs, $name if -d $path;
    }
    closedir($base_dh);

    print "\n\nI'm looking through $base. \n Here are the directories there. \n\n";
    die "There are no directories in $base\n" unless @dirs;

    my $dircount = scalar @dirs;
    for my $i (1 .. $dircount) {
        print $i . ") " . $dirs[ $i - 1 ] . "\n";
    }

    my $choice;
    until (defined $choice) {
        print "\nWhich directory do you want? \n Type the number and press enter:\n\n";
        my $reply = prompt();
        if ($reply =~ /^\s*(\d+)\s*$/ && $1 >= 1 && $1 <= $dircount) {
            $choice = $1;
        }
        else {
            print "Please enter a number between 1 and $dircount.\n";
        }
    }

    # Breadth-first queue of candidate directories (each ends with "/").
    my @queue = ($base . $dirs[ $choice - 1 ] . "/");
    while (defined(my $dir = shift @queue)) {
        print "\nI'm looking through $dir; \nis this the exact directory with the ocrList.txt?\n Y or N: \n\n";
        return $dir . "ocrList.txt" if prompt() =~ /Y/i;

        print "\n\nOkay, then, I'll collect the subdirectories, and we'll look through those\n";
        opendir(my $dh, $dir) or die "can't look through $dir\n";
        for my $name (readdir($dh)) {
            next if $name =~ /^\./;    # skip dot files
            my $path = $dir . $name;
            push @queue, $path . "/" if -d $path;
        }
        closedir($dh);
    }
    return;
}

# Parse ocrList.txt. Returns three array refs: the item numbers flagged with
# "1" after a tab, the distinct collection prefixes of those items, and the
# log lines for non-blank lines that hold no item number at all.
sub read_ocr_list {
    my ($listfile) = @_;
    my (@items, @collections, %seen, @notes);

    open(my $fh, '<', $listfile) or die "can't open $listfile\n";
    while (my $line = <$fh>) {
        if ($line =~ /(([a-z]{1}\d{4}\_\d{7})\_\d{7}.*?)\t *1/) {
            my ($item, $coll) = ($1, $2);
            push @items, $item;

            # compile list of all collections we are working on
            push @collections, $coll unless $seen{$coll}++;
        }
        elsif ($line !~ /[a-z]{1}\d{4}\_\d{7}\_\d{7}/ && $line =~ /\w/) {
            # there's something there, but I don't know what
            push @notes, "NO OCR performed for $line in ocrList.txt file \n";
            print "NO OCR performed for $line in ocrList.txt file \n";
        }
    }
    close($fh);

    return (\@items, \@collections, \@notes);
}

# Scan the deposit area for transcript TIFFs belonging to the requested
# collection(s). Returns a hash mapping the file name up to ".tif" to its path.
# Unreadable directories are reported in the log and skipped.
sub find_deposit_tiffs {
    my ($log, $area, $collnum, $collections) = @_;

    # top level directory should contain the collection number, or one of the
    # collections named in ocrList.txt
    my @wanted = (defined $collnum && length $collnum) ? ($collnum) : @$collections;

    my @queue;
    if (opendir(my $dh, $area)) {
        for my $name (readdir($dh)) {
            next if $name =~ /^\./;    # skip dot files
            my $path = $area . $name;
            next unless -d $path;
            push @queue, $path . "/" if grep { $name =~ /\Q$_\E/ } @wanted;
        }
        closedir($dh);
    }
    else {
        print {$log} "ERROR: can't look in $area: $!\n";
    }

    my %tiff_for;
    while (defined(my $dir = shift @queue)) {
        my $dh;
        unless (opendir($dh, $dir)) {
            print {$log} "ERROR: can't look in $dir: $!\n";
            next;
        }
        for my $name (readdir($dh)) {
            # this hunts for transcripts only
            next
                if $name =~ /^\./
                || $name =~ /Metadata/
                || $name =~ /Admin/
                || $name =~ /Scans/i;
            my $path = $dir . $name;
            if (-d $path) {
                push @queue, $path . "/";
            }
            elsif ($name =~ /^(.*?)\.tif/) {
                $tiff_for{$1} = $path;    # keep it for reference later
            }
        }
        closedir($dh);
    }
    return %tiff_for;
}

# Run tesseract on $tiff, writing to $out_base (tesseract appends ".txt").
# Returns an empty string on success, otherwise a description of the failure:
# anything tesseract printed on STDOUT and/or its non-zero wait status.
sub run_tesseract {
    my ($tiff, $out_base) = @_;

    # List form: no shell is involved, so odd characters in paths are harmless.
    my $child = open(my $pipe, '-|', 'tesseract', $tiff, $out_base);
    return "(could not start tesseract: $!)" unless $child;

    my $printed = do { local $/; <$pipe> };
    $printed = '' unless defined $printed;
    close($pipe);
    my $status = $?;

    my $problem = $printed;
    $problem .= "(tesseract wait status $status)" if $status != 0;
    return $problem;
}

# The background job: walk the online (Acumen) directories for the requested
# items or collection, and create an .ocr.txt beside every *_2048.jpg that has
# neither an .ocr.txt nor a .txt yet. Problems are written to the log handle.
#
# Named arguments: log, acumen, archive, depositArea, collnum, items,
# collections (the last two are array refs).
sub make_ocr {
    my (%arg) = @_;
    my $log = $arg{log};

    # if we have to OCR content already uploaded but not in the archive,
    # we'd best find those files
    my %deposits = find_deposit_tiffs($log, $arg{depositArea}, $arg{collnum}, $arg{collections});

    # we don't want to create OCR for anything not online.
    # use the filenumbers (or the collection number) to find the directories.
    my @requested = @{ $arg{items} } ? @{ $arg{items} } : ($arg{collnum});
    my @todo;
    for my $number (@requested) {
        next unless defined $number && length $number;
        (my $rel = $number) =~ s,_,/,g;    # substitute slashes for underscores
        my $base = $arg{acumen} . $rel . "/";
        if (-e $base) {
            push @todo, $base;
        }
        else {
            print {$log} "NOTE: $number is not online ($base does not exist); skipped\n";
        }
    }

    while (defined(my $dir = shift @todo)) {
        my $dh;
        unless (opendir($dh, $dir)) {
            print {$log} "ERROR: can't open $dir: $!\n";
            next;
        }
        my @names = readdir($dh);
        closedir($dh);

        for my $name (@names) {
            next if $name =~ /^\./ || $name =~ /Metadata/;
            my $path = $dir . $name;
            if (-d $path) {
                push @todo, $path . "/";
                next;
            }

            # ignore thumbnails and everything else
            next unless $name =~ /^(.*?)\_2048\.jpg/;
            my $thisnum = $1;

            # are we in a transcripts directory already?  if so, the text would
            # be in this same directory; otherwise in a Transcripts subdirectory
            my $trans_dir = $dir =~ /Transcripts/ ? $dir : $dir . "Transcripts/";
            my $anocr  = $trans_dir . $thisnum . ".ocr.txt";
            my $atext  = $trans_dir . $thisnum . ".txt";
            my $ocrOut = $trans_dir . $thisnum . ".ocr";

            next if -e $anocr || -e $atext;    # no need to ocr this

            # we need to find the tiff.
            # check for transcript tiff in archive first;
            # if it exists, use that -- don't use the other for this script.
            (my $rel = $thisnum) =~ s,_,/,g;
            my $tiff = $arg{archive} . $rel . "/Transcripts/" . $thisnum . ".tif";
            if (!-e $tiff && exists $deposits{$thisnum}) {
                $tiff = $deposits{$thisnum};    # let's look in the deposits directory
            }
            if (!-e $tiff) {
                print {$log} "ERROR: Can't create OCR for $thisnum; $tiff does not exist in a Transcripts directory\n";
                next;
            }

            # need a transcripts directory
            unless (-e $trans_dir) {
                if (mkdir($trans_dir)) {
                    chmod 0755, $trans_dir;    # exact mode, regardless of umask
                }
                else {
                    print {$log} "ERROR: can't create $trans_dir for $thisnum: $!\n";
                    next;
                }
            }

            my $check = run_tesseract($tiff, $ocrOut);
            if (length $check) {
                print {$log} "ERROR $check creating ocr for $thisnum\n";
            }
        }
    }
    return;
}

# Set (and return) the global $timestamp to the current UTC time in the form
# YYYY-MM-DDTHH:MM:SSZ. (The original also carried a commented-out Windows
# variant based on the `date` and `time` commands.)
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    $timestamp = sprintf(
        "%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec
    );
    return $timestamp;
}

# Numeric comparator for sort().
sub by_number { $a <=> $b; }