#!/usr/bin/perl
use DBI;
# linkContent
# given tiffs located in Scans directories organized by:
# Box_#
# Folder_#
# with filenames that reflect the EAD id
# this script parses box number, folder number
# pulls out filename ids of items (not their subpages)
# either from tiff at this level, or directory containing page files
# turns that into a link: http://acumen.lib.ua.edu/filenameID
# exchanges it for a PURL
# locates the correct EAD in the folder EADs
# reads it in, locates the correct box number, folder number
# and looks through ids of links already added to place this one in order
# prints out altered EAD to UpdatedEads folder
# with component links.
# form of tag insert, per Shawn Averkamp:
#
#
# [whatever we want to call the item]
#
#
#
# and per Donnelly Lancaster Walton, the [whatever we want to call the item] should be:
# Item # -- reflecting the sequence, starting at one, for each folder.
# assume that the files in each folder are complete and in sequence.
# ref numbers need to be unique. Given that item numbers are unique, we will use those
# note that this assumes breakdown into series can occur at the folder level, but not below;
# boxes 21 and 26 of Cabaniss contain content in 2 different series
# jody DeRidder, 11/16/09
$eadDir = "./EADs/";
$outdir = "./UpdatedEADs/";
$content = "./content/";
# put database login stuff here
$hostname = "localhost";
$port = "3306";
$user = "username";
$password = "password";
$database = "InfoTrack";
$dbh = DBI->connect("DBI:mysql:$database:$hostname:$port",
$user, $password) or die "can't connect to database: ",$DBI::errstr,"\n";
$h->{PrintError} = 1;
$h->{RaiseError} = 1;
$sth = $dbh->prepare(" lock table lookup write")
or die "Can't lock dbase tables!! : ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
$sth = $dbh->prepare(" select max(purlnum) from lookup")
or print "Can't get a count! Kill me and call for help!! : ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
$lastone = $sth->fetchrow_array();
warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err();
$lastone ++;
push (@mydirs, $content);
# first, look for content
#opendir(CONTENT, $content) or die "can't read through $content directory\n";
#while ($file = readdir(CONTENT)){
# if ($file =~ /^\./){
# next; # skip dot files
# }
# elsif ($file =~ /Scan/i){
# $path = $content.$file;
# if (-d $path){
# push (@mydirs, $path."/");
# }
# }
# }
#close(CONTENT);
foreach $dir (@mydirs){
opendir(DIR, $dir) or die "can't read through $dir directory\n";
while ($file = readdir(DIR)){
if ($file =~ /^\./){
next; # skip dot files
}
$path = $dir.$file;
if (-d $path){
push (@mydirs, $path."/");
}
elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{7}).*?\.tif$/){
$collnum = $1;
$itemID = $1."_".$2;
$thisItem = $2;
print "collnum $collnum, item $itemID\n";
# multiple pages may exist -- need to be sure at least 1 tiff exists, so can't use
# directory name. We want one link per intellectual item, so one entry here.
if (! $collnum{$itemID}){
push(@{$collnumItems{$collnum}}, $thisItem);
undef $box;
undef $folder;
# what is the series number and folder number?
if ($path =~ /Box\_(\d{1,4})(_[^\/]*)?\/Folder\_(\d{1,4})(_[^\/]*)?\//i){
$box = sprintf ("%02d", $1); # left pad if needed
$folder = sprintf ("%02d",$3);
# print "Folder $folder; Box $box\n";
}
else{ print "ERROR: cannot parse out box and folder from $path\n";}
$collnum{$itemID} = $box."_".$folder;
# print "Assigned $itemID ".$collnum{$itemID}."\n";
}
}
}
close(DIR);
}
@colls = keys (%collnumItems);
foreach $coll (@colls){
$eadnum = $coll.".ead.xml";
$eadpath = $eadDir.$eadnum;
if (! -e $eadpath){ print "NEED $eadpath!\n";}
else{
open(EAD, $eadpath) or die "can't read in $eadpath\n";
while ($line = ){
push (@thisEad, $line);
}
close(EAD);
$outfile = $outdir.$eadnum;
open (OUT, ">".$outfile) or die "can't open $outfile\n";
@mylist = sort by_number (@{$collnumItems{$coll}});
foreach $thisun (@mylist){
$itemnum = $coll."_".$thisun;
($box, $folder) = split ("_", $collnum{$itemnum});
#print "$itemnum box $box, folder $folder\n";
# need to know which series to look for, and which folders within them.
push (@{$whatwhere{$box}{$folder}}, $itemnum) # this puts the items in order within folder within series
}
@mybox = sort by_number( keys (%whatwhere));
print " here's my boxes:\n";
foreach (@mybox){ print "box $_\n";}
# here's what a box looks like. This is box 1, folder 1. Folders for a single box
# may be in different places in the EAD:
#
#
# Incoming, A - B. Cabaniss
# 252.001
# 1
#
#
# all items need to be entered after the and prior to the closing "c" for this folder.
# since folders of a single box may be in multiple places, we need to run through the EAD
# once per box.
@changed = @thisEad;
foreach $abox (@mybox){
$lookfor = sprintf("%03d", $abox); # leftpad to 3 places.
@myfolders = sort by_number (keys(%{$whatwhere{$abox}}));
foreach $afolder (@myfolders){
undef @notMe; # to contain list of items already in EAD
print "$abox contains folder $afolder\n";
undef @myItems;
@myItems = @{$whatwhere{$abox}{$afolder}};
$afolder += 0; # remove left padding
foreach (@myItems){ print "Item $_\n";}
undef $gotIt;
undef $foundBox; # exists when we find that box
undef $foundFolder; # exists when we find one of the needed folders
undef $foundDid; # exists when we've seen the close of the folder did
undef $foundC; # need to add new values BEFORE this; finding this undefs the others, go to next folder
undef @thisRound;
foreach $line (@changed){
if ((! $foundBox) && $line =~ / *\d*\.$lookfor *<\/container>/){
$foundBox = 1;
print "#found box $lookfor\n";
push (@thisRound, $line);
}
elsif($foundBox && (!$foundFolder) && (!$foundC)){
if ($line =~ /<\/c>/){ # this folder is listed somewhere else, or does not exist.
# need to keep looking through the EAD.
undef $foundBox;
}
elsif ($line =~ / *$afolder *<\/container>/){
$foundFolder = 1;
print "found folder $afolder\n";
$gotIt = 1;
}
push (@thisRound, $line);
undef $foundC;
}
elsif($foundBox && $foundFolder && (!$foundC)){
# print "looking for c.\n";
if ($line =~ //){
$closeChild = 1; # close the kid first.
}
elsif ($closeChild && $line =~ /dao id="([^"]*)"/){
$itemThere = $1;
undef $found;
foreach $i (@myItems){
if ($i eq $itemThere){ $found = 1;}
}
if ($found){
print "ERROR: $itemThere is already in the EAD\n";
push (@notMe, $itemThere);
}
}
elsif($line =~ /<\/c>/ && $closeChild){
undef $closeChild;
}
elsif ($line =~ /<\/c>/){
$foundC = 1;
print "found the end of that c -- inserting items\n";
# must insert items before this line
#
#
# [whatever we want to call the item]
#
#
#
# $count = 1;
foreach $item (@myItems){
undef $found;
# first check to see if it's already in there
foreach $i (@notMe){
if ($i eq $item){ $found = 1;}
}
if ($found){next;} # skip this one, it's in there
# instead of using count to give item number, let's use the last 3 digits of the item +0
# an item might not be digitized on purpose.
($myname = $item) =~ s,.*?(\d{3})$,\1,;
$myname += 0;
print "$item is Item $myname\n";
undef $mypurlnum;
$ref = "ref".$item;
# fetch a purl here
# check first to see if we have one yet for this item
$id2009 = $dbh->quote($item);
$sth = $dbh->prepare("select purlnum, realurl from lookup where id_2009 = $id2009")
or die "can't look for $id2009 in database: ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't look for $id2009 in lookup : ", $sth->errstr(),"\n";
($mypurlnum, $myrealurl) = $sth->fetchrow_array();
warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err();
$sth->finish();
if ($mypurlnum){
print "ERROR: $id2009 already in lookup table:\n$mypurlnum $myrealurl\n";
}
else{ # insert this baby
$mypurlnum = $lastone;
$purlnum = $dbh->quote($lastone);
$lastone ++;
$url = "http://acumen.lib.ua.edu/".$item;
$myurl = $dbh->quote($url);
$sth = $dbh->prepare("insert into lookup (dnum, id_2009, purlnum, realurl, datestamp, history) values (NULL,$id2009,$purlnum,$myurl,NULL,NULL)")
or die "can't prepare insert for $item: ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't insert into lookup $id2009 : ", $sth->errstr(),"\n";
$sth->finish();
}
print "$item purlnum is $mypurlnum\n";
push (@thisRound, " \n");
push (@thisRound, " \n");
push (@thisRound, " Item $myname\n");
push (@thisRound, " \n");
push (@thisRound, " \n");
push (@thisRound, " \n");
# $count ++;
}
undef $foundBox;
undef $foundFolder;
}
push (@thisRound, $line);
}
else{ push (@thisRound, $line);}
}
@changed = @thisRound; # keep altering EAD till everything found and linked, then write it out
if (!$gotIt){ print "ERROR!!! $abox $afolder is NOT IN THE EAD!! Items NOT linked!!!\n";}
}
}
foreach (@changed){ print OUT $_;}
close(OUT);
}
}
$sth = $dbh->prepare(" unlock tables")
or die "Can't unlock dbase tables!! : ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
$dbh->disconnect();
sub by_number {$a <=> $b;}