#!/usr/bin/perl
use strict;
use warnings;

use File::Copy;
use File::Path qw(make_path);

# ocrIt
# Jody DeRidder, 11/17/09
#
# Given a set of collection names, this script:
#   - walks the matching collection directories under /srv/archive
#   - finds TIFF files
#   - checks whether an OCR file for each one already exists under
#     /srv/www/htdocs/content
#   - if not, creates the target directory there
#   - creates the derivative (OCR text) with tesseract
#   - puts the derivative in that target directory
# Afterwards it chowns and chmods the content directories so that the scripts
# run by MU and DS to distribute MODS will work.
# Check links at http://libcontent1.lib.ua.edu/content/

my @colls = (
    "u0002_0000002", "u0002_0000006", "u0003_0001328", "u0003_0000520",
    "u0003_0000633", "u0004_0000001", "u0008_0000001",
);    # , "u0008_0000003"

# Locations are hardcoded here.
my $inbase  = "/srv/archive/";
my $outbase = "/srv/www/htdocs/content/";

# Work queue of directories still to be scanned. It is seeded with one
# directory per collection and grows as subdirectories are discovered.
my @dirlist;
for my $coll (@colls) {
    (my $adir = $coll) =~ s{_}{/}g;    # collection id -> relative path
    push @dirlist, $inbase . $adir;
}

# Process the queue front to back (same visiting order as appending to a list
# while walking it, but without modifying an array that foreach is iterating).
while (defined(my $dir = shift @dirlist)) {
    opendir(my $dh, $dir) or die "can't look through $dir: $!\n";

    ENTRY:
    while (defined(my $file = readdir $dh)) {
        next ENTRY if $file =~ /^\./;    # skip dot files

        # Only names that look like u0003..., p0004..., a 3-7 digit number,
        # or that contain "Transcripts" are of interest.
        next ENTRY
            unless $file =~ /^[a-z]{1}\d{4}/
            || $file =~ /Transcripts/
            || $file =~ /^\d{3,7}$/;

        my $path = $dir . "/" . $file;

        if (-d $path) {
            push @dirlist, $path;        # descend into it later
            next ENTRY;
        }

        # Anchored so that only names ending in .tif/.tiff are treated as
        # images (e.g. "x.tif.md5" is no longer handed to tesseract).
        next ENTRY unless $file =~ /^(.*)\.tiff?\z/;
        my $filenum = $1;

        # Mirror the archive path under the web content tree.
        (my $newdir = $dir) =~ s{\A\Q$inbase\E}{$outbase};
        $newdir .= "/Transcripts";
        print "$file from $dir to $newdir\n";

        # Create the path in the new location if it doesn't exist.
        if (!-d $newdir) {
            make_path($newdir, { error => \my $errors });
            if ($errors && @{$errors}) {
                for my $diag (@{$errors}) {
                    my ($where, $message) = %{$diag};
                    print "ERROR creating directory $newdir: $message\n";
                }
                next ENTRY;
            }
        }

        # tesseract treats its second argument as an output *base* and writes
        # "<base>.txt", so the file to look for is "$ocr.txt". A bare "$ocr"
        # is also honoured in case such a file was placed there by other means.
        my $ocr = $newdir . "/" . $filenum . ".ocr";
        next ENTRY if -e "$ocr.txt" || -e $ocr;

        # List-form system() bypasses the shell, so odd characters in the
        # path cannot break or alter the command. Success is judged by the
        # exit status rather than by whether anything was printed to stdout.
        my $rc = system('tesseract', $path, $ocr);
        if ($rc == 0) {
            print "created $ocr.txt\n";
        }
        else {
            my $why
                = $rc == -1 ? "could not run tesseract: $!"
                : $rc & 127 ? "tesseract killed by signal " . ($rc & 127)
                :             "tesseract exit status " . ($rc >> 8);
            print "ERROR ($why) creating $ocr.txt\n";
        }
    }

    closedir $dh;
}

# Clean up: fix ownership and permissions on the whole content tree.
for my $cmd (
    [ 'chown', '-R', 'taloewald:www', $outbase ],
    [ 'chmod', '-R', '775',           $outbase ],
    )
{
    system(@{$cmd}) == 0
        or warn "cleanup command '@{$cmd}' failed\n";
}