#!/usr/bin/perl
use strict;
use warnings;

use File::Copy;
use File::Path qw(make_path);

# ocrSelected
# jody DeRidder, 10/12/10
# looks in /srv/deposits/ocrMe for files ending in ocrList.txt
# each of which contains a list of item numbers to be digitized followed by 1 (yes) or 0 (no)
# collects the item numbers followed by a 1
# goes through /srv/archive
# finds tiff files
# checks to see if OCR files for this already exist in /srv/www/htdocs/content
# if not, creates directories for them there
# creates derivatives
# puts the derivatives there

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 7/30/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# locations hardcoded here:
my $lists   = "/srv/deposits/ocrMe/";        # where the *ocrList.txt files are dropped
my $inbase  = "/srv/archive/";               # root of the tiff archive
my $outbase = "/srv/www/htdocs/content/";    # root of the web content tree

# ---------------------------------------------------------------------------
# Step 1: read every *ocrList.txt file and collect the ids flagged with a 1.
# ---------------------------------------------------------------------------

my %idlist;    # hash of collection numbers as keys, arrays of ids as values
my @items;     # every selected id, across all collections

opendir(my $lists_dh, $lists) or die "can't look in $lists: $!\n";
while (defined(my $file = readdir($lists_dh))) {
    next if $file =~ /^\./;                  # no dot files
    next unless $file =~ /ocrList\.txt/i;

    my $gotone = $lists . $file;
    # print "found $gotone\n";
    open(my $in, '<', $gotone) or die "can't open $gotone: $!\n";
    while (my $line = <$in>) {
        # $1 is the full item id, $2 is its leading collection part;
        # the line is only accepted when the id is followed by a 1.
        if ($line =~ /^(([a-z]{1}\d{4}\_\d{7})\_\d{7})\s+1/) {
            my ($id, $coll) = ($1, $2);
            push @{ $idlist{$coll} }, $id;
            push @items, $id;
        }
        else {
            print "no match: $line\n";
        }
    }
    close($in);
}
closedir($lists_dh);

my @colls = sort keys %idlist;

#foreach my $coll (@colls){
#    my @mylist = @{$idlist{$coll}};
#    print "\n$coll:\n";
#    foreach (@mylist){ print "   $_\n";}
#    }
#exit;

# ---------------------------------------------------------------------------
# Step 2: walk the archive directories of the selected collections.
# ---------------------------------------------------------------------------

# @dirlist is a FIFO work queue: it starts with one directory per collection
# and every qualifying subdirectory found during the walk is appended to it.
my @dirlist;
foreach my $coll (@colls) {
    (my $adir = $coll) =~ s,\_,\/,g;         # substitute forward slash for underscores
    push @dirlist, $inbase . $adir;
}

while (defined(my $dir = shift @dirlist)) {
    opendir(my $dh, $dir) or die "can't look through $dir: $!\n";

    ENTRY:
    while (defined(my $file = readdir($dh))) {
        # print "looking at $file in $dir\n";
        next ENTRY if $file =~ /^\./;        # skip dot files

        # must match pattern u0003, p0004,(etc) 3-7 numbers, etc
        next ENTRY
            unless $file =~ /^[a-z]{1}\d{4}/
                || $file =~ /Transcripts/
                || $file =~ /^\d{3,7}$/;

        my $path = $dir . "/" . $file;
        if (-d $path) {                      # directories are queued for a later pass
            push @dirlist, $path;
            next ENTRY;
        }

        next ENTRY unless $file =~ /^(.*)\.tif/;    # tif file
        my $filenum = $1;

        # The file is wanted when any selected id occurs inside its name.
        # index() is a literal substring test, so the id is never treated as a regex.
        next ENTRY unless grep { index($filenum, $_) >= 0 } @items;

        # here, capture the path: same relative location, but under $outbase
        (my $newdir = $dir) =~ s,^\Q$inbase\E,$outbase,;
        $newdir .= "/Transcripts";
        print "$file from $dir to $newdir\n";

        # create the path in the new location if it doesn't exist
        if (!-e $newdir) {
            my @created = make_path($newdir, { error => \my $errors });
            # make_path applies the umask; force the mode the script has always used
            chmod 0775, @created if @created;
            if ($errors && @{$errors}) {
                foreach my $diag (@{$errors}) {
                    my ($failed, $message) = %{$diag};
                    warn "can't create $failed: $message\n";
                }
                next ENTRY;                  # nowhere to put the derivative
            }
        }

        # create the derivatives if they don't exist
        # name them appropriately and put them in the new location
        # tesseract appends ".txt" to the output base it is given, so the file
        # to look for is "$ocr.txt", not $ocr itself.
        my $ocr = $newdir . "/" . $filenum . ".ocr";
        if (!-e "$ocr.txt") {
            # list form: no shell, so odd characters in paths are harmless
            my $status = system('tesseract', $path, $ocr);
            if ($status != 0) {
                my $val
                    = $status == -1  ? "could not run tesseract: $!"
                    : $status & 127  ? "signal " . ($status & 127)
                    :                  "exit status " . ($status >> 8);
                print "ERROR $val creating $ocr.txt\n";
            }
            else {
                print "created $ocr.txt\n";
            }
        }
    }
    closedir($dh);
}

# ---------------------------------------------------------------------------
# Step 3: clean up -- reset ownership and permissions on the whole output tree.
# ---------------------------------------------------------------------------
system('chown', '-R', 'taloewald:www', $outbase) == 0
    or warn "chown -R taloewald:www $outbase failed\n";
system('chmod', '-R', '775', $outbase) == 0
    or warn "chmod -R 775 $outbase failed\n";