#!/usr/bin/perl
use File::Copy;
use Time::Local;

# makeJpegs
# jody DeRidder, 5/3/10
# expects share drive content under /cifs-mount/
# requests collection directory on share drive
# locates all Scans directories and Transcript directories
# creates jpegs and dumps them into /home/jeremiah/UploadArea/jpegs
# for testing and distribution into Acumen.
# will make another version of this to pull content from share
# into deposits directory, verify the copies, and then delete on share.

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 6/10/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Overall flow:
#   1. the operator picks the collection directory and confirms the collection number
#   2. filename / sequence quality control on everything under the Scans directories
#   3. optional MODS and batch-metadata checks; MODS are copied to the upload area
#   4. the OCR plan is decided (everything, whole collection in Acumen, or a list)
#   5. a forked child makes the JPEGs, copies transcripts and runs OCR,
#      while the parent returns control to the operator
# The script keeps its state in package globals, as it always has.

timestamp();    # sets $timestamp, used in the report file name

# location hardcoded here:
$inbase = "/cifs-mount/Digital_Coll_Completed/";
$outbase = "/home/jeremiah/UploadArea/";
$acumen = "/srv/www/htdocs/content/";
$archive = "/srv/archive/";
$depositArea = "/srv/deposits/content/";

# testing
#$outbase = "/home/jlderidder/UploadArea/";

@dirs = list_subdirs($inbase);
@dirs or die "no directories found in $inbase\n";
print "\n\nI'm looking through $inbase. \n Here are the directories there. \n\n";
print_numbered(@dirs);
print "\nWhich directory do you want? \n Type the number and press enter:\n\n";
$inbase = $inbase.read_choice(@dirs)."/";

# let the operator walk up and down the tree until we sit in the collection directory
while (1){
    print "\nI'm looking through $inbase; \nis this the collection directory?\n Y or N: \n\n";
    if (ask() =~ /Y/i){ last; }

    print "\nShould I go back up a directory level?\n Y or N: \n\n";
    if (ask() =~ /Y/i){
        $inbase =~ s,^(.*\/).*?\/$,$1,;    # drop the last path component
        next;
    }

    @dirs = list_subdirs($inbase);
    if (! @dirs){
        print "\nThere are no directories inside $inbase\n";
        next;
    }
    print "\nPlease choose one of the following directories: \n\n";
    print_numbered(@dirs);
    print "\n\n Type the number and press enter:\n";
    $inbase = $inbase.read_choice(@dirs)."/";
}

# now we are in the correct collection directory
# check for the collection number first
if ($inbase =~ /([a-z]{1}[\d]{4}\_[\d]{7})/){
    $collnum = $1;
}
else{
    print "\nPlease enter the collection number\n";
    $collnum = ask();
}
while (1){
    print "\nIs $collnum the correct collection number? Y or N:\n";
    if (ask() =~ /N/i){
        print "\nPlease enter the collection number\n";
        $collnum = ask();
    }
    else{ last; }
}

# output file will be based on collection number and timestamp
$output = $outbase."output/".$collnum."_".$timestamp.".txt";
$out = "UploadArea/output/".$collnum."_".$timestamp.".txt";    # short form, for messages only
open (OUT, ">", $output) or die "can't open $output\n";
# Unbuffered, so that nothing written before the fork below is still sitting in
# a buffer that both the parent and the child would flush (duplicated lines).
select((select(OUT), $| = 1)[0]);

# first, let's do some QC on filenames. A sanity check.
# not checking numbering -- sequences -- at item level.
# not checking Admin or Metadata directories.
print "\n\nShould we check item sequence? Y or N\n";
print "Answer Y only if it starts with item 1 and \nshould be consecutively numbered.\n";
if (ask() =~ /Y/i){ $ItemNumCheck = 1; }

print "\n\nPlease wait while we do a little quality control checking ...\n\n";

# let's find the Scans next.
opendir(BASE, $inbase) or die "can't open $inbase\n";
while ($file = readdir(BASE)){
    $path = $inbase.$file."/";
    if ($file =~ /^\./){ next;}
    elsif ($file =~ /Transcripts/){ push (@transdirs, $path); }
    elsif ($file =~ /Scans/i){ push (@scansdirs, $path); }
}
closedir(BASE);
#foreach (@scansdirs){ print "scans directory: $_\n";}

# now we have all scans directories in @scansdirs and all transcript directories in @transdirs
# to avoid confusion in the later scripts, we'll duplicate them for this test.
@testScans = @scansdirs;
@testTrans = @transdirs;
$coll = $collnum;

# first layer in each is assumed to be item level
# NOTE: subdirectories are appended to @testScans while it is being iterated;
# the loop relies on foreach picking up the appended entries.
foreach $dir (@testScans){
    opendir (DIR, $dir) or die "can't read files in $dir\n";

    # shorter form of the path for messages
    ($short = $dir) =~ s,\/cifs\-mount\/Digital\_Coll\_Completed\/,,;
    # parent directory is the last path component; strip any trailing slash
    # first, otherwise the top-level Scans directories give an empty parent
    ($parent = $short) =~ s,\/+$,,;
    $parent =~ s,.*\/,,;

    while ($adir = readdir(DIR)){
        if ($adir =~ /^\./ || $adir =~ /Thumbs\.db/i){next;}    #skip dot files and thumbs.db
        $thispath = $dir."/".$adir;

        if ($adir =~ /\Q$coll\E/i){    # passes first test, it matches collname
            # subdirectories for items containing multiple pages or sub pages:
            # capture the item number only
            if ($adir =~ /.*\_(\d{7})$/) {
                push (@itemnums, $1 + 0);    # + 0 gets rid of leading zeros
            }
            # files with a 3-character extension and no page number: item number
            elsif ($adir =~ /.*\_(\d{7})(\..{3})$/ ) {
                push (@itemnums, $1 + 0);
            }
            # files with a 3-character extension and a page number:
            # record the page number under the parent item
            elsif ($adir =~ /.*\_(\d{4})(\..{3})$/ ) {
                push (@{$itempages{$parent}}, $1 + 0);
            }
            # files with a 3-character extension and a subpage number:
            # record the subpage number under its item-and-page
            elsif ($adir =~ /.*\_(\d{7}\_\d{4})\_(\d{3})(\..{3})$/ ) {
                push (@{$itemsubpages{$1}}, $2 + 0);
            }

            # if this is a directory, but it does not have the form similar to
            # u0003_0000002_0000003 or the same with _0004 or _0004_003 patterns attached
            # then an error is generated. Also we collect the directory for further diving for gold.
            if ( -d $thispath){
                if ($adir !~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?$/){
                    push (@badform, $adir);
                }
                push (@testScans, $thispath);    # collect subdirectories for further investigation
            }
            else{
                # not a directory; must be a file. Does it match its parent directory?
                if ($parent !~ /Scans/i && $adir !~ /\Q$parent\E/){
                    push (@wrongdir, $adir." parent ->$parent<- in ".$short);
                }
                if ($adir !~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?\.\w{3}$/){
                    push (@badform, $adir);
                }
            }
        }
        else {
            push (@wrongdir, $adir." ".$short);
        }
    }
    closedir(DIR);
}

if ($ItemNumCheck){
    # here we are checking sequences of item numbers; only do this if asked
    $count = 1;
    foreach $mynum (sort by_number @itemnums){
        if ($mynum != $count){
            $diff = $mynum - $count;
            if ($diff < 0){ print "error: $mynum item, expected count is $count\n";}
            $lastmissing = $mynum - 1;
            if ($lastmissing != $count){
                push (@badcount, "item level --> $diff missing files: $count through $lastmissing");
            }
            else{
                push (@badcount, "item level --> $count is missing");
            }
        }
        $count = $mynum + 1;
    }
}

# now we are checking sequences of pages, then of subpages
foreach $item (sort keys %itempages){
    check_page_sequence($item, @{$itempages{$item}});
}
foreach $itempage (sort keys %itemsubpages){
    check_page_sequence($itempage, @{$itemsubpages{$itempage}});
}
undef @itemnums;
undef %itempages;
undef %itemsubpages;

if (@badform || @badcount || @wrongdir || @missing ){
    print "TROUBLE in QC: please check the $out file\nUnable to continue\n\n";
    print OUT "\nTROUBLE: $collnum \n";
    print OUT "-------------------------------------------\n";
    if (@missing){
        print OUT "\nThe following files are missing or badly named \n";
        foreach (@missing){ print OUT " $_\n";}
    }
    if (@wrongdir){
        print OUT "\nThe following files or directories do NOT reflect the name of their parent directory\n";
        print OUT "Are they in the right place? Please check:\n";
        foreach (@wrongdir){ print OUT " $_\n";}
    }
    if (@badform){
        print OUT "\nThe following filenames or directories are not in the correct format:\n";
        foreach (@badform){ print OUT " $_\n";}
    }
    if (@badcount){
        print OUT "\nSome files seem to be missing, according to sequence gaps\n";
        foreach (@badcount){ print OUT " $_\n";}
    }
    print "\n\nAny problems found have been output to the \n$out file\n";
    exit;
}

print "\n\nQuality control checking completed; no problems found so far.\n\n";

# then look through directories
# need to make sure there's a MODS for each item...?
# expect item at first level within scans folders
# expect MODS in MODS directory within Metadata
undef $doMods;
print "\n\nAre you uploading MODS? Y or N:\n";
if (ask() =~ /Y/i){
    $doMods = 1;
    $modsdir = $inbase."Metadata/MODS/";
    $mdDir = $inbase."Metadata/";
    $depoMD = $depositArea.$collnum."/Metadata/";    #look here for existing batch metadata in deposits
    ($mydirs = $collnum) =~ s,\_,\/,g;    # substitute slashes for underscores in filename
    $archiveMD = $archive.$mydirs."/Metadata/";    # and look here for same in the archive

    # keep asking until we have a directory that really contains MODS,
    # or the operator decides not to upload MODS after all
    while (1){
        while (! -e $modsdir){
            print "Unable to locate $modsdir. \n Please enter the MODS directory within $inbase:\n";
            $modsdir = $inbase.ask();
        }
        # later code appends file names and wildcards directly to $modsdir
        $modsdir .= "/" unless $modsdir =~ m,\/$,;

        opendir(MODS, $modsdir) or die "cannot open $modsdir\n";
        while ($file = readdir(MODS)){
            if ($file =~ /^\./){ next;}
            if ($file =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})\.mods\.xml/){
                # item level only
                push (@modsnums, $1);
            }
            elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7}\_\d{4}(\_\d{3})?)\.mods\.xml/){
                # pages and subpage levels only
                push (@modsPagenums, $1);
            }
        }
        closedir(MODS);
        if (@modsnums || @modsPagenums){ last; }

        print "Unable to find MODS in $modsdir\n Are you uploading MODS? Y or N:\n";
        if (ask() !~ /Y/i){
            undef $doMods;    # operator changed their mind; skip all MODS handling
            last;
        }
        print "Please enter the MODS directory within $inbase\n";
        $modsdir = $inbase.ask();
    }
}

#if (@modsnums){ print OUT "here's my modsnums:\n\n";
#    foreach (@modsnums){ print OUT "$_\n";}
#    }
#if (@modsPagenums){ print OUT "here's my modsPagenums:\n\n";
#    foreach (@modsPagenums){ print OUT "$_\n";}
#    }

# so if $doMods exists, compare list of numbers in @modsnums to items in scans, and
# also copy the MODS to the working directory.
if ($doMods){    # no sense comparing if there aren't any
    # let's check for a metadata spreadsheet also.
    # if MODS are being uploaded, so should the spreadsheet
    opendir(MD, $mdDir) or die "can't look through $mdDir\n";
    undef (@haveBatches);
    undef (@newBatches);
    undef $someSS;
    while ($file = readdir(MD)){
        if ($file =~ /^($collnum(\.\d+)?\.txt)/){ push (@newBatches, $1); }
    }
    closedir(MD);
    if (! @newBatches){
        print OUT "ERROR! There is no collection metadata text export in the metadata directory (properly named).\n";
        print OUT "please make one and start over.\n";
        print "ERROR! There is no collection metadata text export in the metadata directory (properly named).\n";
        print "please make one and start over.\n\n";
        exit;
    }

    # same with archiveMD and depoMD
    foreach $mdplace ($archiveMD, $depoMD){
        if (! -e $mdplace){ next; }
        opendir(MD, $mdplace) or die "can't look through $mdplace\n";
        while ($file = readdir(MD)){
            if ($file =~ /($collnum(\.\d+)?\.txt)/){ push (@haveBatches, $1); }
        }
        closedir(MD);
    }

    %haveBatch = map { $_ => 1 } @haveBatches;
    @alreadyHere = grep { $haveBatch{$_} } @newBatches;
    if (@alreadyHere){
        print OUT "\n\nERROR! Metadata files by the same name have already been uploaded:\n";
        print "\n\nERROR! Metadata files by the same name have already been uploaded:\n";
        foreach (@alreadyHere){ print OUT $_."\n";}
        foreach (@alreadyHere){ print $_."\n";}
        print "Is the new upload a REPAIR??? Y or N:\n\n";
        if (ask() =~ /n/i){
            print OUT "Please modify the filename to be the next batch, and start over.\n\n";
            print "Please modify the filename to be the next batch, and start over.\n\n";
            exit;
        }
    }

    print "\n\nChecking for correspondence between items and MODS\n\n";
    print "\n\nPlease wait... \n\n";
    # NOTE(review): directories are pushed onto @testDirs without a trailing slash,
    # so $dir.$file is not a real path below the first level and deeper
    # directories are never descended into. Names are still collected. Left as is.
    @testDirs = @scansdirs;
    foreach $dir (@testDirs){
        opendir(DIR, $dir) or die "can't open $dir\n";
        while ($file = readdir(DIR)){
            if ($file =~ /^\./ || $file =~ /thumbs\.db/i){ next;}
            elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7})(\.tif)?$/ ){
                # we assume here that the item number is at the top level
                $item = $1;
                $path = $dir.$file;
                if ( -d $path){    # then this is a directory
                    push (@testDirs, $path);
                    $path = $dir.$file."/".$item."_0001.tif";    # look for page 1
                    if (! -e $path){    # sanity check; is there a tiff?
                        print "ERROR: No tiffs in $dir$file?\n";
                        print OUT "ERROR: No tiffs in $dir$file?\n";
                        next;
                    }
                }
                push (@scannums, $item);
            }
            elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7}\_\d{7}\_\d{4}(\_\d{3})?)(\.tif)?$/ ){
                $item = $1;
                push (@pageNums, $item);
                $path = $dir.$file;
                if ( -d $path){    # then this is a directory
                    push (@testDirs, $path);
                    $path = $dir.$file."/".$item."_0001.tif";    # look for page 1
                    if (! -e $path){    # sanity check; is there a tiff?
                        print "ERROR: No tiffs in $dir$file?\n";
                        print OUT "ERROR: No tiffs in $dir$file?\n";
                        next;
                    }
                }
            }
        }
        closedir(DIR);
    }

    # print OUT "\n\n here's my item numbers:\n";
    # foreach (@scannums){ print OUT "$_\n";}
    # print OUT "\n\n here's my page numbers:\n\n";
    # foreach (@pageNums){ print OUT "$_\n";}

    %isPage = map { $_ => 1 } @pageNums;
    %isScan = map { $_ => 1 } @scannums;
    %isMods = map { $_ => 1 } @modsnums;

    # for page level, only check if there's a scan per mods, not the other way around
    foreach $m (@modsPagenums){
        # allow a match on the page itself or on its first subpage
        if (! ($isPage{$m} || $isPage{$m."_001"})){ push (@missingScans, $m);}
    }

    # for item level, check for one-to-one correspondence between mods and item numbers
    foreach $s (@scannums){
        if (! $isMods{$s}){ push (@missingMods, $s);}
    }
    foreach $m (@modsnums){
        if ($isScan{$m}){ next; }
        # do we have pages for this mods?
        $page = $m."_0001";
        $subpage = $page."_001";
        if (! ($isPage{$page} || $isPage{$subpage})){ push (@missingScans, $m);}
    }

    if (@missingScans || @missingMods){
        print "ERROR! Cannot continue until you have a MODS for each item, and an item for each MODS\n";
        print OUT "\nERROR! Cannot continue until you have a MODS for each item, and an item for each MODS\n";
        if (@missingScans){
            print "\nThese MODS files do not have scanned items:\n";
            print OUT "\nThese MODS files do not have scanned items:\n";
            foreach (@missingScans){ print " $_\n"; print OUT " $_\n";}
        }
        if (@missingMods){
            print "\nThese scanned items do not have MODS:\n";
            print OUT "\nThese scanned items do not have MODS:\n";
            foreach (@missingMods){ print " $_\n"; print OUT " $_\n";}
        }
        print "\n\nAny problems found have been output to the command line \nand also will be found in the \n$out file\n";
        exit;
    }
    else{
        # copy the MODS over
        print "\n\nMODS check complete! Copying the MODS to the server now.\n\n";
        $outmods = $outbase."MODS/.";
        $inmods = $modsdir."*";
        print " copying $inmods $outmods\n";
        `cp $inmods $outmods`;
        $val = `diff $modsdir $outmods`;    # check to see if copy went well
        # NOTE(review): unlink does not expand the "*" in $inmods, so this never
        # removes anything from the share. Left unchanged on purpose: turning on
        # deletion of the share copy should be a deliberate decision.
        if (!$val){ unlink $inmods;}    # if all copied, then delete share copy
    }
}

# ocr ?
$admindir = $inbase."/Admin/";
$ocrlist = $admindir."ocrList.txt";
if (-e $ocrlist){
    $getocr = 1;
}
else{
    while (1){
        print "I do not find an ocrList.txt file in the Admin directory for this collection.\n";
        print "Will we be creating OCR today for anything but transcripts? Y or N\n";
        if (ask() !~ /y/i){ last; }

        print "Shall we create OCR for:\n 1. all the content you're uploading\n";
        print " 2. the entire collection in Acumen\n 3. only what is in the file I can't find yet\n";
        print " Any other number means no OCR today!!\n\nPlease enter a number:\n\n";
        $ans = ask();
        if ($ans =~ /1/){ $ocrItAll = 1;}
        elsif ($ans =~ /2/){ $acumenToo = 1;}
        elsif ($ans =~ /3/){
            print "Please name the plain text OCR list \"ocrList.txt\" and place in the admin folder.\nHit enter when this is done.\n\n";
            ask();
            if (! -e $ocrlist){ next; }    # still not there; start the questions over
            $getocr = 1;
        }
        last;
    }
}

# read the OCR list, and keep asking for a better file until it yields something
while ($getocr){
    open (OCR, "<", $ocrlist) or die "can't open $ocrlist\n";
    while ($line = <OCR>){
        if ($line =~ /([a-z]{1}\d{4}\_\d{7}\_\d{7}.*?)\t *1/){
            push (@ocrMe, $1);
        }
        elsif (!($line =~ /[a-z]{1}\d{4}\_\d{7}\_\d{7}/)){
            if ($line =~ /\w/){    # there's something there, but I don't know what
                print OUT "NO OCR performed for $line in ocrList.txt file \n";
                print "NO OCR performed for $line in ocrList.txt file \n";
            }
        }
    }
    close(OCR);
    if (@ocrMe){ last; }

    print "ERROR! I could not locate any items in ocrList.txt to OCR\n";
    print "Please check file names in the list, and encoding of the file\n";
    print "ANSI text file is best. I can't read UTF-16.\n\n";
    print "When you have replaced the file, please hit enter\n";
    ask();
}

# hand the long-running work to a child process so the operator gets the prompt back
print "\n\nWe are starting to make the JPEGS now\n";
unless (fork){
    # the child process does this stuff
    # (if fork itself fails we also land here, and the work runs in the foreground)

    # the work is meant to survive the ssh window closing, so ignore the hangup
    $SIG{HUP} = 'IGNORE';

    # if we have to OCR content already uploaded but not in the archive, we'd best find those files
    if ($getocr || $acumenToo){
        opendir(DIR, $depositArea) or die "can't look in $depositArea\n";
        while ($file = readdir(DIR)){
            if ($file =~ /^\./){ next; }    #skip dot files
            $path = $depositArea.$file;
            if ((-d $path) && ($file =~ /$collnum/)){
                # top level directory should contain collection number
                push (@hunting, $path."/");
            }
        }
        closedir(DIR);

        foreach $dir (@hunting){
            opendir(DIR, $dir) or die "can't look in $dir\n";
            while ($file = readdir(DIR)){
                # this overlooks uploaded transcripts because we do NOT want to overwrite other files
                # thus, previously uploaded transcripts will NOT be ocr'd by this script
                if ($file =~ /^\./ || $file =~ /Metadata/ || $file =~ /Admin/ || $file =~ /Transcripts/i){ next; }
                $path = $dir.$file;
                if (-d $path){ push (@hunting, $path."/"); }
                elsif ($file =~ /(.*?)\.tif/){
                    $deposits{$1} = $path;    # keep it for reference later
                }
            }
            closedir(DIR);
        }
        @inDeposits = keys (%deposits);
    }

    # we have all scans directories in @scansdirs and all transcript directories in @transdirs
    $newdir= $outbase."jpegs/";
    $transdir = $outbase."transcripts/";

    # now, let's make the jpegs for the scans, and then the transcripts
    foreach $dir (@scansdirs){
        opendir(DIR, $dir) or die "can't open $dir\n";
        while ($file = readdir(DIR)){
            if ($file =~ /^\./ || $file =~ /thumbs\.db/i){ next;}
            $path = $dir.$file;
            if ($file =~ /^(.*?)(\.tif)/ ){
                # if a tiff, make the jpegs and drop them on the server
                $filenum = $1;
                $big = $newdir."/".$filenum."_2048.jpg";
                # $mid = $newdir."/".$filenum."_512.jpg";
                $small = $newdir."/".$filenum."_128.jpg";

                # create the derivatives if they don't exist
                make_derivative($path, $big, 2048);
                # make_derivative($path, $mid, 512);
                make_derivative($path, $small, 128);
                if (! -e $big ){
                    print OUT "ERROR: Failed to make a 2048 size jpeg for $filenum!\n";
                }
                elsif (! -e $small){
                    print OUT "ERROR: Failed to make a 128 size jpeg for $filenum!\n";
                }

                if ($ocrItAll || $acumenToo){
                    ocr_file($path, $transdir.$filenum.".ocr", $filenum);
                }
                elsif ($getocr){
                    # have a list; see if the item this file belongs to is on it
                    if ($filenum =~ /([a-z]{1}\d{4}\_\d{7}\_\d{7})/){
                        $item = $1;
                        if (grep { $_ eq $item } @ocrMe){
                            push (@done, $item);
                            ocr_file($path, $transdir.$filenum.".ocr", $filenum);
                        }
                    }
                }
            }
            elsif (-d $path){
                push (@scansdirs, $path."/");
            }
            else{
                print OUT "Not a tif; ignoring: $file\n";
            }
        }
        closedir(DIR);
    }

    # these for repair, later
    push (@dirlist, $newdir);
    if (@transdirs){
        $newdir = $outbase."transcripts/";
        push (@dirlist, $newdir);
    }

    # now transcripts, if there are any
    foreach $dir (@transdirs){
        opendir(DIR, $dir) or die "can't open $dir\n";
        while ($file = readdir(DIR)){
            if ($file =~ /^\./ || $file =~ /thumbs\.db/i){ next;}
            $path = $dir.$file;
            if ($file =~ /^(.*?)(\.tif)/ ){
                # if a tiff, make the jpegs and drop them on the server
                $filenum = $1;
                $big = $newdir."/".$filenum."_2048.jpg";
                # $mid = $newdir."/".$filenum."_512.jpg";
                $small = $newdir."/".$filenum."_128.jpg";

                # create the derivatives if they don't exist
                make_derivative($path, $big, 2048);
                # make_derivative($path, $mid, 512);
                make_derivative($path, $small, 128);
                sleep(1);
                if (! -e $big ){
                    print OUT "ERROR: Failed to make a 2048 size jpeg for Transcript $filenum!\n";
                }
                if (! -e $small){
                    print OUT "ERROR: Failed to make a 128 size jpeg for Transcript $filenum!\n";
                }

                # look for OCR or hand-corrected transcription.
                $ocrfile = $dir.$filenum.".ocr.txt";
                $txtfile = $dir.$filenum.".txt";
                if (-e $txtfile){
                    # copy it over.
                    $new = $newdir.$filenum.".txt";
                    copy ($txtfile, $new);
                    # don't delete share copy, we want it in the archive
                }
                elsif (-e $ocrfile){
                    $new = $newdir.$filenum.".ocr.txt";
                    copy ($ocrfile, $new);
                    if (-e $new){ unlink $ocrfile;}    # delete share copy
                }
                else{
                    ocr_file($path, $newdir.$filenum.".ocr", $filenum);
                }
            }
            elsif ($file =~ /\.txt$/ ){
                # if a text file, copy it over.
                # Copy only: removing it here would depend on the order readdir
                # returns names (the tiff branch above decides what may be deleted).
                $new = $transdir.$file;
                copy ($path, $new);
            }
            elsif (-d $path){
                push (@transdirs, $path."/");
            }
        }
        closedir(DIR);
    }

    # now repair the problems caused by thumbs in the tiffs.
    # goes through the directories
    # finds jpg files with -1.jpg and -0.jpg
    # renames the -0 ones, removing the -0, and
    # deletes the -1 files
    foreach $dir (@dirlist){
        opendir (DIR, $dir) or die "can't look through $dir\n";
        while ($file = readdir(DIR)){
            if ($file =~ /^\./ ){ next; }    # skip dot files
            $path = $dir."/".$file;
            if ( -d $path){
                push (@dirlist, $path);
            }
            elsif ($file =~ /^(.*)\-1\.jpg/){
                unlink $path;
            }
            elsif ($file =~ /^(.*)\-0\.jpg/){
                $filenum = $1.".jpg";
                $good = $dir."/".$filenum;
                move ($path, $good);
                if (! -e $good){ print OUT "ERROR: Repair failed to move $path to $good!\n";}
            }
        }
        closedir(DIR);
    }

    # now... OCR of content in Acumen??
    if ($acumenToo || $getocr){
        if ($getocr){
            # use the filenumbers to find the files.
            %alreadyDone = map { $_ => 1 } @done;
            foreach $val (@ocrMe){
                if ($alreadyDone{$val}){ next; }    # did this already
                ($mydirs = $val) =~ s,\_,\/,g;    # substitute slashes for underscores in filename
                $mybase = $acumen.$mydirs."/";
                if (-e $mybase){ push (@dothese, $mybase); }
            }
        }
        else{
            ($mydirs = $collnum) =~ s,\_,\/,g;    # substitute slashes for underscores in filename
            $mybase = $acumen.$mydirs."/";
            push (@dothese, $mybase);
        }

        # THINK ABOUT THIS:
        # are transcripts done first, and then other files?
        # we don't want to overwrite transcript ocr with file ocr!!!
        # note: transcripts in the archive that are NOT already OCR'd are not included here, unless
        # the jpeg for the transcript is in Acumen.
        foreach $val (@dothese){
            opendir(ITEM, $val) or die "can't open $val\n";
            while ($file = readdir(ITEM)){
                if ($file =~ /^\./ || $file =~ /Metadata/){ next;}
                $path = $val.$file;
                if (-d $path){
                    push (@dothese, $path."/");
                    next;
                }
                if ($file !~ /^(.*?)\_2048\.jpg/){ next; }    # ignore thumbnails and everything else
                $thisnum = $1;

                # are we in a transcripts directory already?
                if ($val =~ /Transcripts/){
                    # if so, then the text would be in this same directory
                    $anocr = $val.$thisnum.".ocr.txt";
                    $atext = $val.$thisnum.".txt";
                    $ocrOut = $val.$thisnum.".ocr";
                    undef $newtrans;
                }
                else {
                    # text would be in a Transcripts subdirectory
                    $anocr = $val."/Transcripts/".$thisnum.".ocr.txt";
                    $atext = $val."/Transcripts/".$thisnum.".txt";
                    $ocrOut = $val."/Transcripts/".$thisnum.".ocr";
                    $newtrans = $val."/Transcripts";
                }
                if ( (-e $anocr) || (-e $atext)){ next;}    # no need to ocr this

                # we need to find the tiff.
                ($mydirs = $thisnum) =~ s,\_,\/,g;    # substitute slashes for underscores in filename
                $tiff = $archive.$mydirs."/".$thisnum.".tif";
                # check for transcript tiff in archive first
                # if it exists, use that instead.
                $tstiff = $archive.$mydirs."/Transcripts/".$thisnum.".tif";
                if (-e $tstiff){ $tiff = $tstiff;}
                if ( ! -e $tiff && exists $deposits{$thisnum}){
                    # let's look in the deposits directory
                    $tiff = $deposits{$thisnum};
                }

                if ( ! -e $tiff){
                    print OUT "ERROR: Can't create OCR for $thisnum; $tiff does not exist\n";
                }
                else{
                    if ($newtrans && ! -e $newtrans){
                        `mkdir -m 0755 $newtrans`;    # create a directory
                    }
                    ocr_file($tiff, $ocrOut, $thisnum);
                }
            }
            closedir(ITEM);
        }
    }

    print OUT "Upload of JPEGs and any transcript material is complete,\n as well as OCR creation if requested\n";
    print OUT "Please check content in UploadArea, then run relocate_all\n to distribute into Acumen.\n";
    print OUT "\n\nGREAT WORK!!! :-)\n\n";
    close(OUT);
    exit;
}

# parent process gives a little more info, then exits
print "Please check back in a few hours\n";
print "Any problems found will be in the \n$out file.\n";
print "\n\nThank you!!\n\n";
exit;

# Read one line from the operator, without its line ending.
# Dies when STDIN is exhausted so the question loops cannot spin forever.
sub ask {
    my $reply = <STDIN>;
    defined $reply or die "no more input on STDIN; giving up\n";
    chomp $reply;
    return $reply;
}

# Return the names of the non-hidden subdirectories of $base, in readdir order.
# $base is expected to end in a slash.
sub list_subdirs {
    my ($base) = @_;
    opendir(my $handle, $base) or die "can't open $base\n";
    my @found = grep { !/^\./ && -d $base.$_ } readdir($handle);
    closedir($handle);
    return @found;
}

# Print the given names as a menu numbered from 1.
sub print_numbered {
    my (@choices) = @_;
    my $number = 0;
    foreach my $choice (@choices){
        $number++;
        print $number.") ".$choice."\n";
    }
    return;
}

# Read a menu number for @choices and return the chosen entry,
# asking again until the reply is a number that is actually on the menu.
sub read_choice {
    my (@choices) = @_;
    my $last = scalar @choices;
    while (1){
        my $reply = ask();
        if ($reply =~ /^\s*(\d+)\s*$/ && $1 >= 1 && $1 <= $last){
            return $choices[$1 - 1];
        }
        print "Please type a number between 1 and $last and press enter:\n";
    }
}

# Check that the page (or subpage) numbers recorded for $owner run 1, 2, 3, ...
# Gaps are appended to the global @badcount; out-of-order or duplicate numbers
# are reported on the terminal as well.
sub check_page_sequence {
    my ($owner, @numbers) = @_;
    my $expected = 1;
    foreach my $found (sort by_number @numbers){
        if (! $found){ print "no page number for something in $owner\n";}
        if ($found != $expected){
            my $gap = $found - $expected;
            if ($gap < 0){ print "error: $found page in $owner item, expected count is $expected\n";}
            my $lastgap = $found - 1;
            if ($lastgap != $expected){
                push (@badcount, "pages of $owner --> $gap missing files: $expected through $lastgap");
            }
            else{
                push (@badcount, "pages of $owner --> $expected is missing");
            }
        }
        $expected = $found + 1;
    }
    return;
}

# Create a JPEG derivative of $tiff at $jpeg, fitting in $pixels x $pixels,
# unless that derivative already exists.
sub make_derivative {
    my ($tiff, $jpeg, $pixels) = @_;
    if (! -e $jpeg){
        `convert $tiff -strip -density 96 -resample 96x96 -resize ${pixels}x${pixels} -filter Cubic -quiet $jpeg`;
    }
    return;
}

# Run tesseract on $tiff, writing to the output base name $target.
# Anything tesseract prints on standard output is logged as an error for $label.
sub ocr_file {
    my ($tiff, $target, $label) = @_;
    my $complaint = `tesseract $tiff $target`;
    if ($complaint){ print OUT "ERROR $complaint creating ocr for $label\n";}
    return;
}

# Set the global $timestamp to the current UTC time, as YYYY-MM-DDTHH:MM:SSZ,
# and return it.
sub timestamp{
    #following for Windows
    #print "hit enter twice please\n";
    #$date = `date`;
    #$time = `time`;
    #print $date."\n";
    #if ($date =~ /.*? (\d*)\/(\d*)\/(\d*)/){
    #    $date = $3.$1.$2;
    #    }
    #print $date."\n";
    #if ($time =~ /.*? (\d*)\:(\d*)\:(\d*)\./){
    #    $time = $1.$2.$3;
    #    }
    #print $time."\n";
    #$timestamp = $date."T".$time;
    #print $timestamp."\n";

    # following for unix
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    # gmtime counts months from 0 and years from 1900
    $timestamp = sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
    return $timestamp;
}

# numeric comparison for sort
sub by_number {$a <=> $b;}