#!/usr/bin/perl
use strict;
use warnings;

use File::Copy;
use Time::Local;

# makeCabanissJpegs
# jody DeRidder, 5/3/10
# expects share drive content under /cifs-mount/
# requests collection directory on share drive
# locates all Scans directories and Transcript directories
# creates jpegs and dumps them into /home/jeremiah/UploadArea/jpegs
# for testing and distribution into Acumen.
# (NOTE: the output location actually configured below is
#  /home/jeremiah/CABANISS/jpegs.)
# will make another version of this to pull content from share
# into deposits directory, verify the copies, and then delete on share.
#
# Overall flow:
#   1. list the directories under the collection base and let the user
#      pick one or more of them;
#   2. run a QC pass over the picked trees (file naming, box/folder
#      placement, item/page/subpage sequences, duplicates) and write a
#      report file;
#   3. if no blocking errors were found and the user confirms, fork a
#      background child that builds 128px and 2048px JPEGs with
#      ImageMagick `convert`, then repairs the "-0"/"-1" outputs that
#      `convert` writes for TIFFs with an embedded thumbnail.

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 10/10/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice,
##   this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice,
##   this list of conditions and the following disclaimer in
##   the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of
##   its contributors may be used to endorse or promote products
##   derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
##AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
##PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
##EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
##OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
##NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# location hardcoded here:
my $INBASE     = "/cifs-mount/Digital_Coll_Complete/u0003_0000252_Cabaniss/";
my $JPEG_DIR   = "/home/jeremiah/CABANISS/jpegs";
my $OUTPUT_DIR = "/home/jeremiah/CABANISS/output/";
my $COLLNUM    = "u0003_0000252";

# output file will be based on collection number and timestamp
my $timestamp = timestamp();
my $output    = $OUTPUT_DIR . $COLLNUM . "_" . $timestamp . ".txt";
my $out       = "CABANISS/output/" . $COLLNUM . "_" . $timestamp . ".txt";   # short name shown to the user

my @dirs = list_subdirectories($INBASE);
die "no directories found under $INBASE\n" unless @dirs;

my @scansdirs;
my $out_fh;

# Select directories, QC them, and ask for confirmation.  Answering "n" at the
# end starts over with a fresh selection and a fresh QC state (the report file
# is truncated and rewritten).
SELECTION:
while (1) {
    @scansdirs = choose_directories($INBASE, @dirs);

    open($out_fh, '>', $output) or die "can't open $output: $!\n";

    print "Performing a bit of QC first. Please be patient...\n";
    my $qc = run_qc(@scansdirs);
    analyze_qc($qc);

    print "\nTotal count of valid content in this upload:\n"
        . " $qc->{total_items} items and $qc->{file_count} digital files\n";

    write_qc_report($out_fh, $qc);

    # Only "general errors" block the run; format/sequence problems are
    # reported but do not stop JPEG creation.
    if (@{ $qc->{errors} }) {
        print "Errors were found; please check $out and correct them\n then rerun this script.\n";
        close($out_fh);
        exit;
    }

    print "\n\nQC complete! It all looks great!!\n";
    print "\n\nWe are starting to make the JPEGS now.\n";
    print "I will be picking up the following directories. Y or N?\n";
    print " $_\n" for @scansdirs;

    my $ans = <STDIN>;
    last SELECTION unless defined $ans && $ans =~ /n/i;

    print "Let's try again, then.\n";
    close($out_fh);
}

# Make the jpegs in a background child so the user gets the prompt back.
my $pid = fork();
die "can't fork: $!\n" unless defined $pid;

if ($pid == 0) {
    # the child process does this stuff
    # Keep running when the ssh window that started us is closed.
    $SIG{HUP} = 'IGNORE';

    make_jpegs($out_fh, $JPEG_DIR, @scansdirs);

    # now repair the problems caused by thumbs in the tiffs.
    repair_split_jpegs($out_fh, $JPEG_DIR);

    print {$out_fh} "Upload of JPEGs and any transcript material is complete.\n";
    print {$out_fh} "Please check content in UploadArea, then run relocate_all\n to distribute into Acumen.\n";
    print {$out_fh} "\n\nGREAT WORK!!! :-)\n\n";
    close($out_fh);
    exit;
}

# parent process gives a little more info, then exits
print "Please check back in a few hours\n";
print "Any problems found will be in the \n$out file.\n";
print "\n\nThank you!!\n\n";
exit;


# list_subdirectories($base)
# Returns the sorted names (not paths) of the non-dot directories directly
# under $base.  $base must end with a slash.  Dies if $base cannot be read.
sub list_subdirectories {
    my ($base) = @_;

    opendir(my $dh, $base) or die "can't open $base: $!\n";
    my @subdirs = sort grep { !/^\./ && -d $base . $_ } readdir($dh);
    closedir($dh);

    return @subdirs;
}

# choose_directories($base, @dirs)
# Shows a numbered menu of @dirs and reads choices from STDIN until the user
# declines to pick another.  Returns the chosen directories as full paths with
# a trailing slash, each listed once.  Invalid input is re-prompted; end of
# input while a number is expected is fatal.
sub choose_directories {
    my ($base, @dirs) = @_;
    my @chosen;
    my %seen;

    print "\n\nI'm looking through $base. \n Here are the directories there. \n\n";

    my $ans = "y";
    while ($ans =~ /y/i) {
        for my $i (0 .. $#dirs) {
            print(($i + 1) . ") " . $dirs[$i] . "\n");
        }
        print "\nWhich directory do you want? \n Type the number and press enter:\n\n";

        my $num = <STDIN>;
        die "no more input while waiting for a directory number\n" unless defined $num;
        chomp $num;
        $num =~ s/^\s+|\s+$//g;

        if ($num !~ /^\d+$/ || $num < 1 || $num > scalar(@dirs)) {
            print "Please type a number between 1 and " . scalar(@dirs) . ".\n";
            next;
        }

        my $choice = $base . $dirs[$num - 1] . "/";
        push @chosen, $choice unless $seen{$choice}++;

        print "Do you want to select another directory also? Y or N:\n";
        $ans = <STDIN>;
        $ans = "" unless defined $ans;    # end of input counts as "no"
    }

    return @chosen;
}

# collection_relative($dir)
# Returns $dir with everything before the first occurrence of the collection
# number removed, so messages and Box/Folder parsing start at the collection.
sub collection_relative {
    my ($dir) = @_;

    (my $rel = $dir) =~ s/.*?\Q$COLLNUM\E/$COLLNUM/;
    return $rel;
}

# run_qc(@start_dirs)
# Walks every directory below @start_dirs (each with a trailing slash) and
# collects raw QC findings.  Returns a hash reference with these keys:
#   errors, wrongdir, badform : array refs of message strings
#   item_seq : "BB_FF" => [item sequence numbers found in that box/folder]
#   pages    : item id => [page numbers found in that item's directory]
#   subpages : "item_page" => [subpage numbers]
#   found_in : collection number => { tif file name => [directories] }
# Dies if a directory cannot be read (this runs in the foreground).
sub run_qc {
    my (@start_dirs) = @_;

    my %qc = (
        errors   => [],
        wrongdir => [],
        badform  => [],
        item_seq => {},
        pages    => {},
        subpages => {},
        found_in => {},
    );

    # Explicit work queue: subdirectories are appended as they are found.
    my @queue = @start_dirs;
    while (defined(my $dir = shift @queue)) {
        opendir(my $dh, $dir) or die "can't read through $dir directory: $!\n";

        while (defined(my $file = readdir($dh))) {
            next if $file =~ /^\./ || $file =~ /Admin/;    # skip dot files and Admin material

            my $path = $dir . $file;

            if (-d $path) {
                # An item directory (collnum_BBFFIII) counts as one item of its box/folder.
                if ($file =~ /^[a-z]\d{4}_\d{7}_(\d{2})(\d{2})(\d{3})$/) {
                    my ($boxnum, $foldernum, $item_sequence) = ($1, $2, $3 + 0);
                    push @{ $qc{item_seq}{ $boxnum . "_" . $foldernum } }, $item_sequence;
                }
                push @queue, $path . "/";
            }
            elsif ($file =~ m{
                    ^ ([a-z]\d{4}_\d{7})                # 1: collection number
                    _ ( (\d{2}) (\d{2}) (\d{3}) )       # 2: item; 3: box, 4: folder, 5: sequence
                    (?: _ (\d{4}) (?: _ (\d{3}) )? )?   # 6: page, 7: subpage (both optional)
                    \.tif $
                }x)
            {
                check_tif(\%qc, {
                    dir       => $dir,
                    file      => $file,
                    path      => $path,
                    collnum   => $1,
                    item      => $2,
                    boxnum    => $3,
                    foldernum => $4,
                    sequence  => $5 + 0,     # gets rid of padding zeros
                    page      => $6,
                    subpage   => $7,
                });
            }
            else {
                next if $file =~ /thumbs\.db/i || $path =~ /Admin/;
                push @{ $qc{badform} }, "$file in " . collection_relative($dir);
            }
        }
        closedir($dh);
    }

    return \%qc;
}

# check_tif($qc, $tif)
# Records one well-named tif in $qc and checks that its name agrees with the
# Box_/Folder_/item directory it was found in.  $tif is a hash reference with
# the keys built in run_qc (page and subpage are undef when absent from the
# file name).
sub check_tif {
    my ($qc, $tif) = @_;

    my $file        = $tif->{file};
    my $path        = $tif->{path};
    my $boxnum      = $tif->{boxnum};
    my $foldernum   = $tif->{foldernum};
    my $item_id     = $tif->{collnum} . "_" . $tif->{item};
    my $has_page    = defined $tif->{page};
    my $has_subpage = defined $tif->{subpage};
    my $this_page   = $has_page    ? $tif->{page} + 0    : 0;
    my $this_sub    = $has_subpage ? $tif->{subpage} + 0 : 0;

    # this catches tifs not in subdirectories
    push @{ $qc->{item_seq}{ $boxnum . "_" . $foldernum } }, $tif->{sequence}
        unless $has_page;

    push @{ $qc->{subpages}{ $tif->{item} . "_" . $this_page } }, $this_sub
        if $has_subpage;

    # keeps list of directories where items are found
    push @{ $qc->{found_in}{ $tif->{collnum} }{$file} }, $tif->{dir};

    # what is the box number and folder number according to the path?
    my $checkdir = collection_relative($tif->{dir});
    my $itemdir;

    if ($checkdir =~ m{Box_(\d{1,2})(?:_[^/]*)?/Folder_(\d{1,2})(?:_[^/]*)?/([^/]+/)?}i) {
        my $box    = sprintf("%02d", $1);    # left pad if needed
        my $folder = sprintf("%02d", $2);
        $itemdir = $3;
        if ($box ne $boxnum || $folder ne $foldernum) {
            push @{ $qc->{badform} },
                "$file name does not reflect Box $box Folder $folder at $path";
        }
    }
    elsif ($checkdir =~ m{Box_(\d{1,2})(?:_[^/]*)?/([^/]+/)?}i) {
        my $box = sprintf("%02d", $1);       # left pad if needed
        $itemdir = $2;
        if ($foldernum != 0 || $box ne $boxnum) {
            push @{ $qc->{badform} },
                "$file name does not reflect Box $box (no subfolders) at $path";
        }
    }
    else {
        push @{ $qc->{errors} }, "cannot parse out box and folder from $path\n";
        return;
    }

    # The remaining checks only apply to tifs inside an item directory.
    return unless defined $itemdir;
    $itemdir =~ s{/}{}g;
    return unless $itemdir;

    if (!$this_page) {
        push @{ $qc->{errors} }, "$file is in $itemdir directory, but has no page number\n";
    }
    elsif ($itemdir ne $item_id) {
        push @{ $qc->{wrongdir} }, "$file is in $itemdir; no match on item id\n";
    }
    elsif (!$has_subpage) {
        # get page number sequence without the padding
        push @{ $qc->{pages}{$itemdir} }, $this_page;
    }

    return;
}

# analyze_qc($qc)
# Derives the summary data from the raw findings of run_qc and stores it in
# $qc under these keys:
#   dupes       : messages for tif names found in more than one directory
#   badcount    : messages for gaps/repeats in item, page and subpage numbers
#   foldercount : "Box B Folder F contains N items" lines
#   pagecounts  : "<item> has N pages" lines
#   total_items : number of items over all box/folders
#   file_count  : number of distinct tif file names
sub analyze_qc {
    my ($qc) = @_;

    my (@dupes, @badcount, @foldercount, @pagecounts);
    my $file_count  = 0;
    my $total_items = 0;

    # Each distinct file name counts once, however many copies exist.
    for my $collnum (sort keys %{ $qc->{found_in} }) {
        my $places_for = $qc->{found_in}{$collnum};
        for my $file (sort keys %$places_for) {
            $file_count++;
            my @places = @{ $places_for->{$file} };
            if (@places > 1) {
                push @dupes, "$file exists in more than one place:\n " . join(" \n", @places);
            }
        }
    }

    for my $bf (sort keys %{ $qc->{item_seq} }) {
        my ($box, $folder) = split /_/, $bf;
        my @items = sort by_number @{ $qc->{item_seq}{$bf} };
        push @foldercount, "Box $box Folder $folder contains " . scalar(@items) . " items";
        $total_items += scalar @items;
        push @badcount, sequence_problems("In Box $box Folder $folder: ", @items);
    }

    for my $doc (sort keys %{ $qc->{pages} }) {
        my @pages = sort by_number @{ $qc->{pages}{$doc} };
        push @pagecounts, "$doc has " . scalar(@pages) . " pages";
        push @badcount, sequence_problems("$doc page problem: ", @pages);
    }

    for my $itempage (sort keys %{ $qc->{subpages} }) {
        my @subpages = sort by_number @{ $qc->{subpages}{$itempage} };
        push @badcount, sequence_problems("$itempage subpage problem: ", @subpages);
    }

    $qc->{dupes}       = \@dupes;
    $qc->{badcount}    = \@badcount;
    $qc->{foldercount} = \@foldercount;
    $qc->{pagecounts}  = \@pagecounts;
    $qc->{total_items} = $total_items;
    $qc->{file_count}  = $file_count;

    return;
}

# sequence_problems($prefix, @numbers)
# @numbers must be sorted ascending and is expected to run 1, 2, 3, ...
# Returns one message (starting with $prefix) per gap, and one per number that
# is repeated or lower than the first expected value.  Returns an empty list
# when the sequence is complete.
sub sequence_problems {
    my ($prefix, @numbers) = @_;
    my @problems;
    my $expected = 1;

    for my $num (@numbers) {
        if ($num < $expected) {
            push @problems,
                "${prefix}$num is duplicated or out of range (expected $expected or higher)";
            next;
        }
        if ($num > $expected) {
            my $missing      = $num - $expected;
            my $last_missing = $num - 1;
            push @problems,
                $last_missing != $expected
                ? "${prefix}$missing missing files: $expected through $last_missing"
                : "${prefix}$expected is missing";
        }
        $expected = $num + 1;
    }

    return @problems;
}

# write_qc_report($fh, $qc)
# Writes the QC report for an analyzed $qc (see analyze_qc) to $fh: a trouble
# banner or a success line, one section per non-empty problem list, then the
# folder and page counts.
sub write_qc_report {
    my ($fh, $qc) = @_;

    my @trouble = map { @{ $qc->{$_} } } qw(badform badcount wrongdir errors dupes);
    if (@trouble) {
        print {$fh} "\nTROUBLE: \n";
        print {$fh} "-------------------------------------------\n";
    }
    else {
        print {$fh} "All is GREAT!! GOOD WORK!! :-) \n";
    }

    if (@{ $qc->{errors} }) {
        print {$fh} "\nThese general errors impeded this script, so everything may NOT have been checked:\n";
        print {$fh} "Please repair and run this again.\n";
        print {$fh} " $_\n" for @{ $qc->{errors} };
    }
    if (@{ $qc->{wrongdir} }) {
        print {$fh} "\nThe following files or directories do NOT reflect the name of their parent directory\n";
        print {$fh} " $_\n" for @{ $qc->{wrongdir} };
    }
    if (@{ $qc->{badform} }) {
        print {$fh} "\nThe following filenames or directories are not in the correct format:\n";
        print {$fh} " $_\n" for @{ $qc->{badform} };
    }
    if (@{ $qc->{badcount} }) {
        print {$fh} "\nThe following folders or items have sequence problems:\n";
        print {$fh} " $_\n" for @{ $qc->{badcount} };
    }
    if (@{ $qc->{dupes} }) {
        print {$fh} "\nThe following files exist in more than one place:\n";
        print {$fh} " $_\n" for @{ $qc->{dupes} };
    }

    print {$fh} "\n\nHere's some more info:\nFOLDER COUNTS:\n_________________\n";
    print {$fh} $_ . "\n" for @{ $qc->{foldercount} };
    print {$fh} "\n\nPAGE COUNTS:\n_______________________\n";
    print {$fh} $_ . "\n" for @{ $qc->{pagecounts} };

    return;
}

# make_jpegs($fh, $jpeg_dir, @start_dirs)
# Walks every directory below @start_dirs (each with a trailing slash) and, for
# each tif, creates <name>_128.jpg and <name>_2048.jpg in $jpeg_dir unless they
# already exist.  Problems are logged to $fh rather than raised, because this
# runs in the background child.
sub make_jpegs {
    my ($fh, $jpeg_dir, @start_dirs) = @_;
    my @sizes = (128, 2048);

    my @queue = @start_dirs;
    while (defined(my $dir = shift @queue)) {
        my $dh;
        unless (opendir($dh, $dir)) {
            print {$fh} "ERROR: can't open $dir: $!\n";
            next;
        }

        while (defined(my $file = readdir($dh))) {
            next if $file =~ /^\./ || $file =~ /thumbs\.db/i;

            my $path = $dir . $file;

            if ($file =~ /^(.*?)(\.tif)/) {
                # if a tiff, make the jpegs and drop them on the server
                my $filenum = $1;

                # create the derivatives if they don't exist
                for my $size (@sizes) {
                    my $jpeg = "$jpeg_dir/${filenum}_$size.jpg";
                    next if -e $jpeg;
                    # List form: no shell, so spaces or metacharacters in
                    # share-drive paths cannot break or alter the command.
                    system('convert', $path,
                        '-strip',
                        '-density',  '96',
                        '-resample', '96x96',
                        '-resize',   "${size}x${size}",
                        '-filter',   'Cubic',
                        '-quiet',
                        $jpeg);
                }

                sleep(1);    # give it time to write the files

                for my $size (@sizes) {
                    next if -e "$jpeg_dir/${filenum}_$size.jpg";
                    print {$fh} "ERROR: Failed to make a $size size jpeg for $filenum!\n";
                }
            }
            elsif (-d $path) {
                push @queue, $path . "/";
            }
            else {
                print {$fh} "Not a tif; ignoring: $file\n";
            }
        }
        closedir($dh);
    }

    return;
}

# repair_split_jpegs($fh, @start_dirs)
# goes through the directories
# finds jpg files with -1.jpg and -0.jpg
# renames the -0 ones, removing the -0, and
# deletes the -1 files
# Failures are logged to $fh.
sub repair_split_jpegs {
    my ($fh, @start_dirs) = @_;

    my @queue = @start_dirs;
    while (defined(my $dir = shift @queue)) {
        my $dh;
        unless (opendir($dh, $dir)) {
            print {$fh} "ERROR: can't look through $dir: $!\n";
            next;
        }
        # Read the whole listing first: entries are renamed and deleted below.
        my @entries = readdir($dh);
        closedir($dh);

        for my $file (@entries) {
            next if $file =~ /^\./;    # skip dot files

            my $path = $dir . "/" . $file;

            if (-d $path) {
                push @queue, $path;
            }
            elsif ($file =~ /^(.*)\-1\.jpg/) {
                unlink($path)
                    or print {$fh} "ERROR: Repair failed to delete $path: $!\n";
            }
            elsif ($file =~ /^(.*)\-0\.jpg/) {
                my $good = $dir . "/" . $1 . ".jpg";
                move($path, $good)
                    or print {$fh} "ERROR: Repair failed to move $path to $good! ($!)\n";
            }
        }
    }

    return;
}

# timestamp()
# Returns the current UTC time as "YYYY-MM-DDThh:mm:ssZ".
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();

    return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}

# Numeric ascending comparator for sort.
sub by_number { $a <=> $b; }