#!/usr/bin/perl
use DBI;
use File::Copy;
use POSIX qw(strftime);

# linkContent
# links content into EAD, using PURLs.
# this version for Cabaniss, to run off pre-generated jpegs on libcontent1
# copies this EAD into backup file first, then overwrites it with altered version.
# in this situation, all jpegs are in one directory, and there are 3 versions. We'll
# select the one ending in _2048.jpg to use (the largest)
# jpeg filenames reflect the EAD id
# this script parses box number, folder number
# pulls out filename ids of items (not their subpages)
# turns that into a link: http://acumen.lib.ua.edu/filenameID
# exchanges it for a PURL
# locates the correct EAD in the folder EADs
# reads it in, locates the correct box number, folder number
# and looks through ids of links already added to place this one in order
# prints out altered EAD in place of previous version,
# with component links.
# jody DeRidder, 12/10/09
#
# form of tag insert, per Shawn Averkamp:
# NOTE(review): the sample markup that belongs in the next two groups of
# lines appears to be missing from this copy of the file (only the text
# content survives) -- TODO restore from the author's original.
#
#
# [whatever we want to call the item]
#
#
#
##!!! Altered 12.15.09 by Shawn to add AFTER item
#
#
#Francis William Kellogg Letter
#Sc0004
#2027.01
#
#
#
#
# and per Donnelly Lancaster Walton, the [whatever we want to call the item] should be:
# Item
# -- reflecting the sequence, starting at one, for each folder.
# assume that the files in each folder are complete and in sequence.
# ref numbers need to be unique. 
# NOTE(review): the four physical lines below hold the entire main script run
# together.  Perl ends a `#` comment only at a newline, so wherever an inline
# `#` comment starts on one of these lines it also swallows every statement
# that was meant to follow it; the original line breaks must be restored
# before this can run.
# NOTE(review): text also looks lost from this copy -- TODO confirm each item
# against the author's original before relying on this file:
#   * `×tamp;` is presumably a mangled call to the timestamp sub, which is
#     what sets $timestamp for the backup and README file names;
#   * `while ($line = ){` has no filehandle read, and the `if ($line =~ /`
#     that follows runs straight into `connect(...)`; whatever filled
#     @thisEad and @doThese, opened OUT and assigned $dbh is not visible;
#   * `$h->{PrintError}` / `$h->{RaiseError}` are set on $h, while every
#     later database call goes through $dbh -- verify the intended handle;
#   * several patterns and pushed strings are empty or whitespace-only
#     (`$line =~ //`, `push (@thisRound, " \n")`), presumably stripped markup;
#   * `sort by_number (@doThese)` compares with <=>, but the ids accepted by
#     the filename pattern start with a letter, so they all compare as 0 and
#     this sort cannot order them -- a string sort may have been intended;
#   * the database user and password are hard-coded in this file.
Given that item numbers are unique, we will use those # note that this assumes breakdown into series can occur at the folder level, but not below; # boxes 21 and 26 of Cabaniss contain content in 2 different series # earlier version by jody DeRidder, 11/16/09 $eadDir = "../EAD/"; $backups = "../backups/"; $content = "../jpegs/"; $coll = "u0003_0000252"; $ead = $coll.".ead.xml"; # put database login stuff here $hostname = "localhost"; $port = "3306"; $user = "addData"; $password = "moreStuff"; $database = "InfoTrack"; # script begins by backing up current EAD with timestamp # reading it in and extracting out IDs of content already linked. ×tamp; $backup = $ead.$timestamp; $output = ">../output/Linking_README_$timestamp"; open (OUTPUT, $output) or die "cannot write output errors\n"; $old = $eadDir.$ead; $new = $backups.$backup; copy ($old, $new) or die "could not make a backup copy of $ead!\n"; open (IN, $old) or die "can't read $ead\n"; while ($line = ){ if ($line =~ /connect("DBI:mysql:$database:$hostname:$port", $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n"; $h->{PrintError} = 1; $h->{RaiseError} = 1; $sth = $dbh->prepare(" lock table lookup write") or die "Can't lock dbase tables!! : ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't execute SQL statement: ", $sth->errstr(),"\n"; $sth = $dbh->prepare(" select max(purlnum) from lookup") or print "Can't get a count! Kill me and call for help!! 
: ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't execute SQL statement: ", $sth->errstr(),"\n"; $lastone = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); $lastone ++; @mylist = sort by_number (@doThese); foreach $item (@mylist){ if ($item =~ /^[a-z]{1}\d{4}\_\d{7}\_(\d{2})(\d{2})(\d{3})$/){ $box= $1; $folder= $2; $thisItem = $3; $location{$item} = $box.'_'.$folder; # this puts the items in order within folder within boxes push (@{$whatwhere{$box}{$folder}}, $item); # print "$box, $folder for item $item\n"; } else{ push(@errors, "ERROR: Filename format is incorrect: -->$item<--\n");} } @mybox = sort by_number( keys (%whatwhere)); print " here's my boxes:\n"; foreach (@mybox){ print "box $_\n";} # here's what a box looks like. This is box 1, folder 1. Folders for a single box # may be in different places in the EAD: # # # Incoming, A - B. Cabaniss # 252.001 # 1 # # # all items need to be entered after the and prior to the closing "c" for this folder. # since folders of a single box may be in multiple places, we need to run through the EAD # once per box. @changed = @thisEad; foreach $abox (@mybox){ $lookfor = sprintf("%03d", $abox); # leftpad to 3 places. @myfolders = sort by_number (keys(%{$whatwhere{$abox}})); foreach $afolder (@myfolders){ undef @notMe; # to contain list of items already in EAD # print "$abox contains folder $afolder\n"; undef @myItems; @myItems = @{$whatwhere{$abox}{$afolder}}; $afolder += 0; # remove left padding # foreach (@myItems){ print "Item $_\n";} undef $gotIt; undef $foundBox; # exists when we find that box undef $foundFolder; # exists when we find one of the needed folders undef $foundDid; # exists when we've seen the close of the folder did undef $foundC; # need to add new values BEFORE this; finding this undefs the others, go to next folder undef @thisRound; foreach $line (@changed){ if ((! 
$foundBox) && $line =~ / *\d*\.$lookfor *<\/container>/){ $foundBox = 1; # print "found box $lookfor\n"; push (@thisRound, $line); } elsif($foundBox && (!$foundFolder) && (!$foundC)){ if ($line =~ /<\/c>/){ # this folder is listed somewhere else, or does not exist. # need to keep looking through the EAD. undef $foundBox; } elsif ($line =~ / *$afolder *<\/container>/){ $foundFolder = 1; # print "found folder $afolder\n"; $gotIt = 1; } push (@thisRound, $line); undef $foundC; } elsif($foundBox && $foundFolder && (!$foundC)){ # print "looking for c.\n"; if ($line =~ //){ $closeChild = 1; # close the kid first. } elsif ($closeChild && $line =~ /dao id="([^"]*)"/){ $itemThere = $1; undef $found; foreach $i (@myItems){ if ($i eq $itemThere){ $found = 1;} } if ($found){ push (@errors, "CHECK: $itemThere is already in the EAD\n"); push (@notMe, $itemThere); } } elsif($line =~ /<\/c>/ && $closeChild){ undef $closeChild; } elsif ($line =~ /<\/c>/){ $foundC = 1; # print "found the end of that c -- inserting items\n"; # must insert items before this line # # # [whatever we want to call the item] # # # # $count = 1; foreach $item (@myItems){ undef $found; # first check to see if it's already in there foreach $i (@notMe){ if ($i eq $item){ $found = 1;} } if ($found){next;} # skip this one, it's in there # instead of using count to give item number, let's use the last 3 digits of the item +0 # an item might not be digitized on purpose. 
($myname = $item) =~ s,.*?(\d{3})$,\1,; $myname += 0; # print "$item is Item $myname\n"; undef $mypurlnum; $ref = "ref".$item; # fetch a purl here # check first to see if we have one yet for this item $id2009 = $dbh->quote($item); $sth = $dbh->prepare("select purlnum, realurl from lookup where id_2009 = $id2009") or die "can't look for $id2009 in database: ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't look for $id2009 in lookup : ", $sth->errstr(),"\n"; ($mypurlnum, $myrealurl) = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); $sth->finish(); if ($mypurlnum){ push (@errors, "CHECK: $id2009 already has a PURL:\n$mypurlnum $myrealurl\n"); } else{ # insert this baby $mypurlnum = $lastone; $purlnum = $dbh->quote($lastone); $lastone ++; $url = "http://acumen.lib.ua.edu/".$item; $myurl = $dbh->quote($url); $sth = $dbh->prepare("insert into lookup (dnum, id_2009, purlnum, realurl, datestamp, history) values (NULL,$id2009,$purlnum,$myurl,NULL,NULL)") or die "can't prepare insert for $item: ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't insert into lookup $id2009 : ", $sth->errstr(),"\n"; $sth->finish(); } # print "$item purlnum is $mypurlnum\n"; push (@thisRound, " \n"); push (@thisRound, " \n"); push (@thisRound, " Item $myname\n"); push (@thisRound, " \n"); push (@thisRound, " \n"); push (@thisRound, " \n"); # $count ++; } undef $foundBox; undef $foundFolder; } push (@thisRound, $line); } else{ push (@thisRound, $line);} } @changed = @thisRound; # keep altering EAD till everything found and linked, then write it out if (!$gotIt){ push(@errors, "ERROR!!! $abox $afolder is NOT IN THE EAD!! Items NOT linked!!!\n");} } } foreach (@changed){ print OUT $_;} close(OUT); $sth = $dbh->prepare(" unlock tables") or die "Can't unlock dbase tables!! : ",$dbh->errstr(),"\n"; $sth->execute() or die "Can't execute SQL statement: ", $sth->errstr(),"\n"; $dbh->disconnect(); if (@errors){ print "All done. 
Check the README file in the output directory for errors, please!\n";
    # The line above finishes the "All done." message begun on the previous
    # line.  Now write every collected ERROR/CHECK message to the OUTPUT handle.
    foreach (@errors) {
        print OUTPUT $_;
    }
    sleep(3);    # leave the message on screen briefly before exiting
    close(OUTPUT);
}
else {
    print "All done! Please check finding aid for results. Thank you!\n";
    sleep(3);
}

# by_number: comparator for sort(), ordering its operands numerically.
# Uses sort's package variables $a and $b, so call it as
# `sort by_number @list`.  Strings that do not start with a number all
# numify to 0 and therefore compare as equal.
sub by_number { $a <=> $b; }

# timestamp: store the current time in the package variable $timestamp,
# formatted as YYYY-MM-DDTHH:MM:SSZ, and return that string.
# Takes no arguments (any passed in are ignored).
#
# strftime produces the same zero-padded fields the hand-written padding
# did, without leaving $sec/$min/$hour/$mday/$mon/$year behind as globals
# and without the unrecognized "\T" / "\Z" string escapes.
#
# NOTE(review): the value comes from localtime(), yet the string carries a
# trailing "Z", which conventionally marks UTC.  Kept as-is so existing
# backup/README file names keep their shape -- confirm which was intended
# (the original had gmtime() commented out).
sub timestamp {
    $timestamp = strftime("%Y-%m-%dT%H:%M:%SZ", localtime());
    return $timestamp;
}