#!/usr/bin/perl
use strict;
use warnings;

use File::Compare;
use File::Copy;
use IO::Handle;
use Time::Local;

# makeAudioJpegs
# jody DeRidder, 5/3/10

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 6/10/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
##   the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
##   derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# expects share drive content under /cifs-mount/
# requests collection directory on share drive
# locates all Scans directories and Transcript directories
# creates jpegs and dumps them into /home/jeremiah/UploadArea/jpegs
# for testing and distribution into Acumen.
# will make another version of this to pull content from share
# into deposits directory, verify the copies, and then delete on share.
#
# NOTE(review): the paragraph above names UploadArea, but the output base
# configured below is /home/jeremiah/AUDIO/ -- confirm which one is current.
#
# Flow of the script:
#   1. interactively pick the collection directory on the share;
#   2. confirm the collection number and open the report file;
#   3. sanity-check file and directory names under every Scans directory;
#   4. optionally check MODS <-> item correspondence and move the MODS over;
#   5. fork: the child moves mp3 files, makes transcript JPEGs, gathers or
#      creates OCR text, and repairs "-0"/"-1" JPEG names; the parent
#      returns to the prompt straight away.

# Nothing may sit in a stdio buffer when we fork, or it is written twice.
$| = 1;

my $timestamp = timestamp();

# location hardcoded here:
my $share_root = "/cifs-mount/Digital_Coll_Complete/";
my $inbase     = $share_root;
my $outbase    = "/home/jeremiah/AUDIO/";
# testing
#$outbase = "/home/jlderidder/UploadArea/";

# ----------------------------------------------------------------------
# 1. Locate the collection directory
# ----------------------------------------------------------------------
{
    my $choice = choose_subdir(
        $inbase,
        "\n\nI'm looking through $inbase. \n Here are the directories there. \n\n",
        "\nWhich directory do you want? \n Type the number and press enter:\n\n",
    );
    die "no directories found in $inbase\n" unless defined $choice;
    $inbase .= $choice."/";
}

until (said_yes("\nI'm looking through $inbase; \nis this the collection directory?\n Y or N: \n\n")) {
    if (said_yes("\nShould I go back up a directory level?\n Y or N: \n\n")) {
        # drop the last path component, keeping the trailing slash
        $inbase =~ s,^(.*\/).*?\/$,$1,;
        next;
    }
    my $choice = choose_subdir(
        $inbase,
        "\nPlease choose one of the following directories: \n\n",
        "\n\n Type the number and press enter:\n",
    );
    if (defined $choice) {
        $inbase .= $choice."/";
    }
    else {
        print "\nThere are no directories inside $inbase\n";
    }
}

# ----------------------------------------------------------------------
# 2. Collection number and report file
# ----------------------------------------------------------------------
# now we are in the correct collection directory
# check for the collection number first
my $collnum = '';
if ($inbase =~ /([a-z]{1}[\d]{4}\_[\d]{7})/) {
    $collnum = $1;
}
# An empty collection number would match every name, so insist on a value.
while ($collnum eq ''
    || prompt_line("\nIs $collnum the correct collection number? Y or N:\n") =~ /N/i) {
    $collnum = prompt_line("\nPlease enter the collection number\n");
}

# output file will be based on collection number and timestamp
my $output = $outbase."output/".$collnum."_".$timestamp.".txt";
my $out    = "AUDIO/output/".$collnum."_".$timestamp.".txt";    # short form shown to the user
open(my $OUT, '>', $output) or die "can't open $output: $!\n";
$OUT->autoflush(1);    # parent and child both hold this handle after fork

# ----------------------------------------------------------------------
# 3. Quality control on names
# ----------------------------------------------------------------------
# first, let's do some QC on filenames.  A sanity check.
# not checking Admin or Metadata directories.
my $ItemNumCheck = said_yes(
    "\n\nShould we check item sequence? Y or N\n"
  . "Answer Y only if it starts with item 1 and \nshould be consecutively numbered.\n"
);

# NOTE(review): the original script gathered page numbers per item, but the
# statement that sorted them into the list being checked was commented out,
# so page-level gaps were never reported.  That behaviour is preserved with
# this flag off.  Before turning it on, confirm that items never hold two
# files with the same page number (they would be reported as duplicates).
my $PageNumCheck = 0;

print "\n\nPlease wait while we do a little quality control checking ...\n\n";

# let's find the Scans next.
my (@scansdirs, @transdirs);
for my $file (dir_entries($inbase)) {
    my $path = $inbase.$file."/";
    next unless -d $path;
    if ($file =~ /Transcripts/) {
        push @transdirs, $path;
    }
    elsif ($file =~ /Scans/i) {
        push @scansdirs, $path;    # also used to check MODS
    }
}

# Shape shared by item directories and files: collection, item, optional
# page and sub-page numbers.
my $stem = qr/[a-z]{1}\d{4}\_\d{7}\_\d{7}(?:\_\d{4}(?:\_\d{3})?)?/;

my (@itemnums, %itempages, @badform, @wrongdir, @badcount);

# first layer in each is assumed to be item level.  Work from a private
# queue so that @scansdirs is left untouched for the later steps.
my @qc_queue = @scansdirs;
while (defined(my $dir = shift @qc_queue)) {
    (my $short  = $dir)   =~ s/\Q$share_root\E//;
    (my $parent = $short) =~ s,.*\/,,;    # whatever follows the last slash

    for my $adir (dir_entries($dir)) {
        my $thispath = $dir."/".$adir;

        # every name must carry the collection number (case-insensitive)
        if (index(lc($adir), lc($collnum)) < 0) {
            push @wrongdir, $adir." ".$short;
            next;
        }

        if ($adir =~ /\_(\d{7})(\..{4})?$/) {
            push @itemnums, $1 + 0;    # + 0 gets rid of leading zeros
        }
        elsif ($adir =~ /\_(\d{4})(\..{3})?$/) {
            push @{ $itempages{$parent} }, $1 + 0;
        }

        if (-d $thispath) {
            push @badform, $adir unless $adir =~ /\A${stem}\z/;
            push @qc_queue, $thispath;    # collect subdirectories for further investigation
        }
        else {
            # not a directory; must be a file.  Does it match its parent directory?
            # index() is used rather than a regex because an empty $parent
            # would otherwise make Perl reuse the last successful pattern.
            if ($parent !~ /Scans/i && index($adir, $parent) < 0) {
                push @wrongdir, $adir." parent ->$parent<- in ".$short;
            }
            push @badform, $adir unless $adir =~ /\A${stem}\.\w{3}\z/;
        }
    }
}

if ($ItemNumCheck) {
    push @badcount, sequence_gaps("item level", "item", sort by_number @itemnums);
}
if ($PageNumCheck) {
    for my $item (sort keys %itempages) {
        my @pages = sort by_number @{ $itempages{$item} };
        print "no page number for something in $item\n" for grep { !$_ } @pages;
        push @badcount, sequence_gaps("pages of $item", "page in $item item", @pages);
    }
}
undef @itemnums;
undef %itempages;

if (@badform || @badcount || @wrongdir) {
    print "TROUBLE in QC: please check the $out file\nUnable to continue\n\n";
    print {$OUT} "\nTROUBLE: $inbase \n";
    print {$OUT} "-------------------------------------------\n";
    if (@wrongdir) {
        tell_both($OUT,
            "\nThe following files or directories do NOT reflect the name of their parent directory\n",
            "Are they in the right place? Please check:\n",
            map(" $_\n", @wrongdir));
    }
    if (@badform) {
        tell_both($OUT,
            "\nThe following filenames or directories are not in the correct format:\n",
            map(" $_\n", @badform));
    }
    if (@badcount) {
        tell_both($OUT,
            "\nSome files seem to be missing, according to sequence gaps\n",
            map(" $_\n", @badcount));
    }
    print "\n\nAny problems found have been output to the command line \nand also will be found in the \n$out file\n";
    close($OUT) or warn "trouble closing $output: $!\n";
    exit 1;
}
print "\n\nQuality control checking completed; no problems found so far.\n\n";

# ----------------------------------------------------------------------
# 4. MODS
# ----------------------------------------------------------------------
# expect item at first level within scans folders
# expect MODS in MODS directory within Metadata
my $doMods  = said_yes("\n\nAre you uploading MODS? Y or N:\n");
my $modsdir = $inbase."Metadata/MODS/";
my @modsnums;
while ($doMods) {
    if (!-d $modsdir) {
        my $entered = prompt_line("Unable to locate $modsdir. \n Please enter the MODS directory within $inbase:\n");
        $modsdir = $inbase.$entered;
        $modsdir .= "/" unless $modsdir =~ m{/\z};
        next;
    }
    @modsnums = ();
    for my $file (dir_entries($modsdir)) {
        push @modsnums, $1 if $file =~ /^(.*)\.mods\.xml/;    # pick up number
    }
    last if @modsnums;

    $doMods = said_yes("Unable to find MODS in $modsdir\n Are you uploading MODS? Y or N:\n");
    if ($doMods) {
        my $entered = prompt_line("Please enter the MODS directory within $inbase\n");
        $modsdir = $inbase.$entered;
        $modsdir .= "/" unless $modsdir =~ m{/\z};
    }
}

# so if $doMods is set, compare list of numbers in @modsnums to items in scans,
# and also move the MODS to the working directory.
if ($doMods) {
    print "\n\nChecking for one-to-one correspondence between items and MODS\n\n";
    print "\n\nPlease wait... \n\n";

    my @scannums;
    for my $dir (@scansdirs) {
        for my $file (dir_entries($dir)) {
            # we assume here that the item number is at the top level
            (my $item = $file) =~ s/\.tif\z//;
            if (-d $dir.$file) {
                # sanity check; is there a tiff?  look for page 1
                my $first_page = $dir.$file."/".$item."_0001.tif";
                if (!-e $first_page) {
                    tell_both($OUT, "ERROR: No tiffs in $dir$file?\n");
                    next;
                }
            }
            push @scannums, $item;
        }
    }

    my %is_mods = map { ($_ => 1) } @modsnums;
    my %is_scan = map { ($_ => 1) } @scannums;
    my @missingMods  = grep { !$is_mods{$_} } @scannums;
    my @missingScans = grep { !$is_scan{$_} } @modsnums;

    if (@missingScans || @missingMods) {
        print "ERROR! Cannot continue until you have a MODS for each item, and an item for each MODS\n";
        print {$OUT} "\nERROR! Cannot continue until you have a MODS for each item, and an item for each MODS\n";
        if (@missingScans) {
            tell_both($OUT, "\nThese MODS files do not have scanned items:\n", map(" $_\n", @missingScans));
        }
        if (@missingMods) {
            tell_both($OUT, "\nThese scanned items do not have MODS:\n", map(" $_\n", @missingMods));
        }
        print "\n\nAny problems found have been output to the command line \nand also will be found in the \n$out file\n";
        close($OUT) or warn "trouble closing $output: $!\n";
        exit 1;
    }

    # move the MODS over, one file at a time, deleting a share copy only
    # once the server copy has been verified byte-for-byte
    print "\n\nMODS check complete! Copying the MODS to the server now.\n\n";
    my $outmods = $outbase."MODS/";
    print " copying ".$modsdir."* ".$outmods.".\n";
    for my $name (dir_entries($modsdir)) {
        my $from = $modsdir.$name;
        next unless -f $from;
        my $to = $outmods.$name;
        if (!copy_verified($from, $to)) {
            tell_both($OUT, "ERROR in copying MODS: $from was not copied correctly to $to\n");
        }
        elsif (!unlink($from)) {
            tell_both($OUT, "ERROR in copying MODS: $from was copied but could not be deleted: $!\n");
        }
    }
}

# ----------------------------------------------------------------------
# 5. Derivatives, made in a background process
# ----------------------------------------------------------------------
print "\n\nWe are starting to make the JPEGS now\n";

my $pid = fork;
warn "could not fork ($!); making the JPEGs in the foreground instead\n"
    unless defined $pid;

if (!$pid) {
    # the child process does this stuff (or the parent itself when fork failed)
    # keep going when the ssh window that started us closes
    $SIG{HUP} = 'IGNORE';

    my $finished = eval {

        # let's move the mp3 files to the server
        my $mp3dir     = $outbase."mp3/";
        my @scan_queue = @scansdirs;
        while (defined(my $dir = shift @scan_queue)) {
            for my $file (dir_entries($dir)) {
                my $path = $dir.$file;
                if (-d $path) {
                    push @scan_queue, $path."/";
                }
                elsif ($file =~ /\.mp3/i) {
                    my $new = $mp3dir.$file;
                    if (!copy_verified($path, $new)) {
                        print {$OUT} "ERROR copying $path to $new; share copy left in place\n";
                    }
                    elsif (!unlink($path)) {    # deletes the copy on the share drive
                        print {$OUT} "ERROR deleting $path after copying it: $!\n";
                    }
                }
                # ignore other files
            }
        }

        # for audio, transcript jpegs go into jpegs.
        # ocr and corrected ocr goes into ocr.
        my $newdir = $outbase."jpegs/";
        my $ocrdir = $outbase."ocr/";

        my @trans_queue = @transdirs;
        while (defined(my $dir = shift @trans_queue)) {
            for my $file (dir_entries($dir)) {
                my $path = $dir.$file;
                if ($file =~ /^(.*?)(\.tif)/) {
                    # a tiff: make the jpegs and drop them on the server
                    my $filenum = $1;
                    make_jpeg($path, $newdir.$filenum."_2048.jpg", "2048x2048", $OUT);
                    make_jpeg($path, $newdir.$filenum."_128.jpg",  "128x128",   $OUT);

                    # look for OCR or hand-corrected transcription.
                    my $ocrfile = $dir.$filenum.".ocr.txt";
                    my $txtfile = $dir.$filenum.".txt";
                    if (-e $txtfile) {
                        # don't delete share copy, we want it in the archive
                        copy($txtfile, $ocrdir.$filenum.".txt")
                            or print {$OUT} "ERROR copying $txtfile: $!\n";
                    }
                    elsif (-e $ocrfile) {
                        my $new = $ocrdir.$filenum.".ocr.txt";
                        if (!copy_verified($ocrfile, $new)) {
                            print {$OUT} "ERROR copying $ocrfile to $new; share copy left in place\n";
                        }
                        elsif (!unlink($ocrfile)) {    # delete share copy
                            print {$OUT} "ERROR deleting $ocrfile after copying it: $!\n";
                        }
                    }
                    else {
                        # ocr it
                        my $val = run_tesseract($path, $ocrdir.$filenum.".ocr");
                        print {$OUT} "ERROR $val creating OCR for $filenum\n" if $val;
                    }
                }
                elsif (-d $path) {
                    push @trans_queue, $path."/";
                }
                else {
                    print {$OUT} "Not a tif; ignoring: $file\n";
                }
            }
        }

        # now repair the problems caused by thumbs in the tiffs:
        # finds jpg files ending in -1.jpg and -0.jpg, renames the -0 ones
        # (removing the -0) and deletes the -1 files
        my @jpeg_queue = ($newdir);
        while (defined(my $dir = shift @jpeg_queue)) {
            for my $file (dir_entries($dir)) {
                my $path = $dir."/".$file;
                if (-d $path) {
                    push @jpeg_queue, $path;
                }
                elsif ($file =~ /^(.*)\-1\.jpg/) {
                    unlink($path) or print {$OUT} "ERROR deleting $path: $!\n";
                }
                elsif ($file =~ /^(.*)\-0\.jpg/) {
                    my $good = $dir."/".$1.".jpg";
                    move($path, $good) or print {$OUT} "ERROR renaming $path to $good: $!\n";
                }
            }
        }

        print {$OUT} "Upload of JPEGs and any transcript material is complete.\n";
        print {$OUT} "Please check content in UploadArea, then run relocate_all\n to distribute into Acumen.\n";
        print {$OUT} "\n\nGREAT WORK!!! :-)\n\n";
        1;
    };
    # nobody is watching this process, so a fatal error must reach the report
    print {$OUT} "ERROR: processing stopped early: $@" unless $finished;

    close($OUT) or warn "trouble closing $output: $!\n";
    exit;
}

# parent process gives a little more info, then exits
print "Please check back in a few hours\n";
print "Any problems found will be in the \n$out file.\n";
print "\n\nThank you!!\n\n";
exit;

# ----------------------------------------------------------------------
# helpers
# ----------------------------------------------------------------------

# Build a UTC timestamp of the form YYYY-MM-DDTHH:MM:SSZ and return it.
sub timestamp {
    #following for Windows
    #print "hit enter twice please\n";
    #$date = `date`;
    #$time = `time`;
    #if ($date =~ /.*? (\d*)\/(\d*)\/(\d*)/){
    #    $date = $3.$1.$2;
    #    }
    #if ($time =~ /.*? (\d*)\:(\d*)\:(\d*)\./){
    #    $time = $1.$2.$3;
    #    }
    #$timestamp = $date."T".$time;

    # following for unix
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# Comparison routine for numeric sorts.
sub by_number { $a <=> $b; }

# Print $message, read one line from STDIN and return it without its line
# ending.  Dies when STDIN is exhausted so the prompt loops cannot spin.
sub prompt_line {
    my ($message) = @_;
    print $message;
    my $reply = <STDIN>;
    die "No more input available; stopping.\n" unless defined $reply;
    chomp $reply;
    return $reply;
}

# Ask a question; true when the reply contains a Y (either case).
sub said_yes {
    my ($message) = @_;
    return prompt_line($message) =~ /Y/i ? 1 : 0;
}

# Names inside $dir, leaving out dot files and thumbs.db.  Dies when the
# directory cannot be opened.
sub dir_entries {
    my ($dir) = @_;
    opendir(my $dh, $dir) or die "can't open $dir: $!\n";
    my @names = grep { !/^\./ && !/thumbs\.db/i } readdir($dh);
    closedir($dh);
    return @names;
}

# Show a numbered menu of the subdirectories of $base (which must end in a
# slash) and return the name picked, asking again until the number is
# valid.  Returns nothing when $base has no subdirectories.
sub choose_subdir {
    my ($base, $intro, $request) = @_;
    my @names = sort grep { -d $base.$_ } dir_entries($base);
    return unless @names;

    print $intro;
    my $total = scalar @names;
    for my $i (1 .. $total) {
        print $i.") ".$names[$i - 1]."\n";
    }
    while (1) {
        my $reply = prompt_line($request);
        if ($reply =~ /^\s*(\d+)\s*$/ && $1 >= 1 && $1 <= $total) {
            return $names[$1 - 1];
        }
        print "Please enter a number between 1 and $total.\n";
    }
}

# Walk an ascending list of numbers that should run 1, 2, 3, ... and return
# one message per gap or repeat.  $label prefixes each message; $unit is
# used in the line printed to the screen when a number is lower than
# expected.
sub sequence_gaps {
    my ($label, $unit, @numbers) = @_;
    my @problems;
    my $expected = 1;
    for my $number (@numbers) {
        if ($number < $expected) {
            print "error: $number $unit, expected count is $expected\n";
            push @problems, "$label --> $number is duplicated or out of sequence";
        }
        elsif ($number > $expected) {
            my $last_missing = $number - 1;
            my $gap          = $number - $expected;
            if ($last_missing != $expected) {
                push @problems, "$label --> $gap missing files: $expected through $last_missing";
            }
            else {
                push @problems, "$label --> $expected is missing";
            }
        }
        $expected = $number + 1;
    }
    return @problems;
}

# Print the same text to the screen and to the report filehandle.
sub tell_both {
    my ($log, @text) = @_;
    print @text;
    print {$log} @text;
    return;
}

# Copy $from to $to; true only when the copy succeeded and both files
# compare as identical, so the caller may safely delete the source.
sub copy_verified {
    my ($from, $to) = @_;
    return 0 unless copy($from, $to);
    return compare($from, $to) == 0 ? 1 : 0;
}

# Create the JPEG derivative $jpeg of $tiff at bounding size $size (for
# example "2048x2048") unless it already exists.  The command is run
# without a shell; failures are written to the report handle $log.
sub make_jpeg {
    my ($tiff, $jpeg, $size, $log) = @_;
    return if -e $jpeg;
    my @command = (
        'convert', $tiff, '-strip', '-density', '96', '-resample', '96x96',
        '-resize', $size, '-filter', 'Cubic', '-quiet', $jpeg,
    );
    system(@command) == 0
        or print {$log} "ERROR: convert failed for $tiff (status $?)\n";
    return;
}

# Run tesseract on $tiff, writing to the output base name $target, without
# going through a shell.  Returns whatever tesseract wrote to standard
# output (the original script treated any such text as an error), or a
# short description when it could not be run or exited with a failure.
sub run_tesseract {
    my ($tiff, $target) = @_;
    my $child = open(my $pipe, '-|', 'tesseract', $tiff, $target);
    return "cannot run tesseract: $!" unless $child;
    my $said = do { local $/; <$pipe> };
    $said = '' unless defined $said;
    my $closed = close($pipe);
    $said = "exit status $?" if !$closed && $said eq '';
    return $said;
}