#!/usr/bin/perl use DBI; # linkContent # given tiffs located in Scans directories organized by: # Box_# # Folder_# # with filenames that reflect the EAD id # this script parses box number, folder number # pulls out filename ids of items (not their subpages) # either from tiff at this level, or directory containing page files # turns that into a link: http://acumen.lib.ua.edu/filenameID # exchanges it for a PURL # locates the correct EAD in the folder EADs # reads it in, locates the correct box number, folder number # and looks through ids of links already added to place this one in order # prints out altered EAD to UpdatedEads folder # with component links. # form of tag insert, per Shawn Averkamp: # # # [whatever we want to call the item] # # # # and per Donnelly Lancaster Walton, the [whatever we want to call the item] should be: # Item # -- reflecting the sequence, starting at one, for each folder. # assume that the files in each folder are complete and in sequence. # ref numbers need to be unique. Given that item numbers are unique, we will use those # note that this assumes breakdown into series can occur at the folder level, but not below; # boxes 21 and 26 of Cabaniss contain content in 2 different series # jody DeRidder, 11/16/09 $eadDir = "./EADs/"; $outdir = "./UpdatedEADs/"; $content = "./content/"; # put database login stuff here $hostname = "localhost"; $port = "3306"; $user = "username"; $password = "password"; $database = "InfoTrack"; $dbh = DBI->connect("DBI:mysql:$database:$hostname:$port", $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n"; $h->{PrintError} = 1; $h->{RaiseError} = 1; $sth = $dbh->prepare(" lock table lookup write") or die "Can't lock dbase tables!! : ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't execute SQL statement: ", $sth->errstr(),"\n"; $sth = $dbh->prepare(" select max(purlnum) from lookup") or print "Can't get a count! Kill me and call for help!! : ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't execute SQL statement: ", $sth->errstr(),"\n"; $lastone = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); $lastone ++; push (@mydirs, $content); # first, look for content #opendir(CONTENT, $content) or die "can't read through $content directory\n"; #while ($file = readdir(CONTENT)){ # if ($file =~ /^\./){ # next; # skip dot files # } # elsif ($file =~ /Scan/i){ # $path = $content.$file; # if (-d $path){ # push (@mydirs, $path."/"); # } # } # } #close(CONTENT); foreach $dir (@mydirs){ opendir(DIR, $dir) or die "can't read through $dir directory\n"; while ($file = readdir(DIR)){ if ($file =~ /^\./){ next; # skip dot files } $path = $dir.$file; if (-d $path){ push (@mydirs, $path."/"); } elsif ($file =~ /^([a-z]{1}\d{4}\_\d{7})\_(\d{7}).*?\.tif$/){ $collnum = $1; $itemID = $1."_".$2; $thisItem = $2; print "collnum $collnum, item $itemID\n"; # multiple pages may exist -- need to be sure at least 1 tiff exists, so can't use # directory name. We want one link per intellectual item, so one entry here. if (! $collnum{$itemID}){ push(@{$collnumItems{$collnum}}, $thisItem); undef $box; undef $folder; # what is the series number and folder number? if ($path =~ /Box\_(\d{1,4})(_[^\/]*)?\/Folder\_(\d{1,4})(_[^\/]*)?\//i){ $box = sprintf ("%02d", $1); # left pad if needed $folder = sprintf ("%02d",$3); # print "Folder $folder; Box $box\n"; } else{ print "ERROR: cannot parse out box and folder from $path\n";} $collnum{$itemID} = $box."_".$folder; # print "Assigned $itemID ".$collnum{$itemID}."\n"; } } } close(DIR); } @colls = keys (%collnumItems); foreach $coll (@colls){ $eadnum = $coll.".ead.xml"; $eadpath = $eadDir.$eadnum; if (! -e $eadpath){ print "NEED $eadpath!\n";} else{ open(EAD, $eadpath) or die "can't read in $eadpath\n"; while ($line = ){ push (@thisEad, $line); } close(EAD); $outfile = $outdir.$eadnum; open (OUT, ">".$outfile) or die "can't open $outfile\n"; @mylist = sort by_number (@{$collnumItems{$coll}}); foreach $thisun (@mylist){ $itemnum = $coll."_".$thisun; ($box, $folder) = split ("_", $collnum{$itemnum}); #print "$itemnum box $box, folder $folder\n"; # need to know which series to look for, and which folders within them. push (@{$whatwhere{$box}{$folder}}, $itemnum) # this puts the items in order within folder within series } @mybox = sort by_number( keys (%whatwhere)); print " here's my boxes:\n"; foreach (@mybox){ print "box $_\n";} # here's what a box looks like. This is box 1, folder 1. Folders for a single box # may be in different places in the EAD: # # # Incoming, A - B. Cabaniss # 252.001 # 1 # # # all items need to be entered after the and prior to the closing "c" for this folder. # since folders of a single box may be in multiple places, we need to run through the EAD # once per box. @changed = @thisEad; foreach $abox (@mybox){ $lookfor = sprintf("%03d", $abox); # leftpad to 3 places. @myfolders = sort by_number (keys(%{$whatwhere{$abox}})); foreach $afolder (@myfolders){ undef @notMe; # to contain list of items already in EAD print "$abox contains folder $afolder\n"; undef @myItems; @myItems = @{$whatwhere{$abox}{$afolder}}; $afolder += 0; # remove left padding foreach (@myItems){ print "Item $_\n";} undef $gotIt; undef $foundBox; # exists when we find that box undef $foundFolder; # exists when we find one of the needed folders undef $foundDid; # exists when we've seen the close of the folder did undef $foundC; # need to add new values BEFORE this; finding this undefs the others, go to next folder undef @thisRound; foreach $line (@changed){ if ((! $foundBox) && $line =~ / *\d*\.$lookfor *<\/container>/){ $foundBox = 1; print "#found box $lookfor\n"; push (@thisRound, $line); } elsif($foundBox && (!$foundFolder) && (!$foundC)){ if ($line =~ /<\/c>/){ # this folder is listed somewhere else, or does not exist. # need to keep looking through the EAD. undef $foundBox; } elsif ($line =~ / *$afolder *<\/container>/){ $foundFolder = 1; print "found folder $afolder\n"; $gotIt = 1; } push (@thisRound, $line); undef $foundC; } elsif($foundBox && $foundFolder && (!$foundC)){ # print "looking for c.\n"; if ($line =~ //){ $closeChild = 1; # close the kid first. } elsif ($closeChild && $line =~ /dao id="([^"]*)"/){ $itemThere = $1; undef $found; foreach $i (@myItems){ if ($i eq $itemThere){ $found = 1;} } if ($found){ print "ERROR: $itemThere is already in the EAD\n"; push (@notMe, $itemThere); } } elsif($line =~ /<\/c>/ && $closeChild){ undef $closeChild; } elsif ($line =~ /<\/c>/){ $foundC = 1; print "found the end of that c -- inserting items\n"; # must insert items before this line # # # [whatever we want to call the item] # # # # $count = 1; foreach $item (@myItems){ undef $found; # first check to see if it's already in there foreach $i (@notMe){ if ($i eq $item){ $found = 1;} } if ($found){next;} # skip this one, it's in there # instead of using count to give item number, let's use the last 3 digits of the item +0 # an item might not be digitized on purpose. ($myname = $item) =~ s,.*?(\d{3})$,\1,; $myname += 0; print "$item is Item $myname\n"; undef $mypurlnum; $ref = "ref".$item; # fetch a purl here # check first to see if we have one yet for this item $id2009 = $dbh->quote($item); $sth = $dbh->prepare("select purlnum, realurl from lookup where id_2009 = $id2009") or die "can't look for $id2009 in database: ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't look for $id2009 in lookup : ", $sth->errstr(),"\n"; ($mypurlnum, $myrealurl) = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); $sth->finish(); if ($mypurlnum){ print "ERROR: $id2009 already in lookup table:\n$mypurlnum $myrealurl\n"; } else{ # insert this baby $mypurlnum = $lastone; $purlnum = $dbh->quote($lastone); $lastone ++; $url = "http://acumen.lib.ua.edu/".$item; $myurl = $dbh->quote($url); $sth = $dbh->prepare("insert into lookup (dnum, id_2009, purlnum, realurl, datestamp, history) values (NULL,$id2009,$purlnum,$myurl,NULL,NULL)") or die "can't prepare insert for $item: ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't insert into lookup $id2009 : ", $sth->errstr(),"\n"; $sth->finish(); } print "$item purlnum is $mypurlnum\n"; push (@thisRound, " \n"); push (@thisRound, " \n"); push (@thisRound, " Item $myname\n"); push (@thisRound, " \n"); push (@thisRound, " \n"); push (@thisRound, " \n"); # $count ++; } undef $foundBox; undef $foundFolder; } push (@thisRound, $line); } else{ push (@thisRound, $line);} } @changed = @thisRound; # keep altering EAD till everything found and linked, then write it out if (!$gotIt){ print "ERROR!!! $abox $afolder is NOT IN THE EAD!! Items NOT linked!!!\n";} } } foreach (@changed){ print OUT $_;} close(OUT); } } $sth = $dbh->prepare(" unlock tables") or die "Can't unlock dbase tables!! : ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't execute SQL statement: ", $sth->errstr(),"\n"; $dbh->disconnect(); sub by_number {$a <=> $b;}