#!/usr/bin/perl
use DBI;
use File::Copy;
# linkContent
# links content into EAD, using PURLs.
# this version for Cabaniss, to run off pre-generated jpegs on libcontent1
# copies this EAD into backup file first, then overwrites it with altered version.
# in this situation, all jpegs are in one directory, and there are 3 versions. We'll
# select the one ending in _2048.jpg to use (the largest)
# jpeg filenames reflect the EAD id
# this script parses box number, folder number
# pulls out filename ids of items (not their subpages)
# turns that into a link: http://acumen.lib.ua.edu/filenameID
# exchanges it for a PURL
# locates the correct EAD in the folder EADs
# reads it in, locates the correct box number, folder number
# and looks through ids of links already added to place this one in order
# prints out altered EAD in place of previous version,
# with component links.
# jody DeRidder, 12/10/09
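# form of tag insert, per Shawn Averkamp
# (NOTE: the XML markup of the examples below appears to have been stripped from
# this copy of the script; only their text content survives):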
#
#
# [whatever we want to call the item]
#
#
#
##!!! Altered 12.15.09 by Shawn to add AFTER item
#
#
#Francis William Kellogg Letter
#Sc0004
#2027.01
#
#
#
# and per Donnelly Lancaster Walton, the [whatever we want to call the item] should be:
# Item # -- reflecting the sequence, starting at one, for each folder.
# assume that the files in each folder are complete and in sequence.
# ref numbers need to be unique. Given that item numbers are unique, we will use those
# note that this assumes breakdown into series can occur at the folder level, but not below;
# boxes 21 and 26 of Cabaniss contain content in 2 different series
# earlier version by jody DeRidder, 11/16/09
# --- locations, all relative to the directory the script is run from ---
($eadDir, $backups, $content) = ("../EAD/", "../backups/", "../jpegs/");
# --- collection being linked; its finding aid is named <collection id>.ead.xml ---
$coll = "u0003_0000252";
$ead = "${coll}.ead.xml";
# --- database login settings go here ---
($hostname, $port) = ("localhost", "3306");
($user, $password) = ("addData", "moreStuff");
$database = "InfoTrack";
# script begins by backing up current EAD with timestamp
# reading it in and extracting out IDs of content already linked.
#
# timestamp() (defined at the bottom of this file) fills in the global
# $timestamp, which names both the backup copy and this run's README.
timestamp();
$backup = $ead.$timestamp;
# $output keeps its leading ">" so the 2-argument open below opens it for writing.
$output = ">../output/Linking_README_$timestamp";
open (OUTPUT, $output) or die "cannot write output errors: $!\n";
# $old is the live EAD, $new is the timestamped copy in the backups directory.
$old = $eadDir.$ead;
$new = $backups.$backup;
# never touch the EAD unless the backup copy succeeded
copy ($old, $new) or die "could not make a backup copy of $ead! $!\n";
# Read the current EAD ($old) and then open OUT, the handle the altered EAD is
# printed to at the end of the script.
# NOTE(review): this span is visibly truncated -- the readline in the while
# condition has no filehandle, and the regex on the following line runs straight
# into the tail of a later statement. Code is left exactly as found.
# @thisEad and @doThese are used further down but are never assigned anywhere in
# the visible script, so presumably the missing statements filled them here and
# also changed $ead before the second open -- confirm against an intact copy of
# the original script before running.
open (IN, $old) or die "can't read $ead\n";
while ($line = ){
if ($line =~ /$old";
open (OUT, $ead) or die "can't open $ead\n";
# get the last purlnum from the database, and lock the tables...
#
# Connects to the lookup database, takes a WRITE lock on the "lookup" table
# (presumably so concurrent runs cannot hand out the same purlnum -- the lock is
# released by the "unlock tables" at the end of the script), and sets $lastone
# to the next unused purlnum (current maximum + 1).
$dbh = DBI->connect("DBI:mysql:$database:$hostname:$port",
    $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n";
# The error attributes must be set on the handle we actually use; they were
# previously assigned to an unrelated, undefined $h and so never took effect.
# With RaiseError on, DBI itself dies on any failed call on this handle.
$dbh->{PrintError} = 1;
$dbh->{RaiseError} = 1;
$sth = $dbh->prepare(" lock table lookup write")
    or die "Can't lock dbase tables!! : ",$dbh->errstr(),"\n";
$sth->execute()
    or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
# A failed prepare used to only print a message and then fall through to
# execute() on an undefined handle; stop here instead.
$sth = $dbh->prepare(" select max(purlnum) from lookup")
    or die "Can't get a count! Call for help!! : ",$dbh->errstr(),"\n";
$sth->execute()
    or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
# single-column row; an empty table gives undef, which the ++ below turns into 1
($lastone) = $sth->fetchrow_array();
warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err();
$sth->finish();
$lastone ++;
# Put the item ids in order and group them by box and folder.
#
# An id looks like <letter><4 digits>_<7 digits>_<BB><FF><III>, where BB is the
# box, FF the folder and III the item number (see the pattern below). Every
# field is fixed width and zero padded, so a plain string sort yields
# box / folder / item order. The previous numeric sort (by_number) compared the
# ids as numbers; they start with a letter, so every id numified to 0 and the
# list was left in whatever order it arrived in.
@mylist = sort @doThese;
foreach $item (@mylist){
    if ($item =~ /^[a-z]{1}\d{4}\_\d{7}\_(\d{2})(\d{2})(\d{3})$/){
        $box= $1;
        $folder= $2;
        $thisItem = $3;
        $location{$item} = $box.'_'.$folder;
        # this puts the items in order within folder within boxes
        push (@{$whatwhere{$box}{$folder}}, $item);
        # print "$box, $folder for item $item\n";
    }
    else{
        # ids that do not fit the naming scheme are reported, not linked
        push(@errors, "ERROR: Filename format is incorrect: -->$item<--\n");
    }
}
# box keys are purely numeric ("01", "02", ...), so a numeric sort is right here
@mybox = sort by_number( keys (%whatwhere));
print " here's my boxes:\n";
foreach (@mybox){ print "box $_\n";}
# here's what a box looks like. This is box 1, folder 1. Folders for a single box
# may be in different places in the EAD:
#
#
# Incoming, A - B. Cabaniss
# 252.001
# 1
#
#
# all items need to be entered after the folder's <did> element and prior to the closing "c" for this folder.
# since folders of a single box may be in multiple places, we need to run through the EAD
# once per box.
# Main pass: for every box/folder that has items, walk the EAD lines and insert
# one entry per item just before the closing </c> of that folder.
# The EAD is held as a list of lines in @changed. Each box/folder pair scans the
# whole list once, rebuilding it into @thisRound, which then replaces @changed,
# so insertions accumulate from one folder to the next.
# NOTE(review): @thisEad is not assigned anywhere in the visible script --
# presumably it is filled while the EAD is read in; confirm.
@changed = @thisEad;
foreach $abox (@mybox){
$lookfor = sprintf("%03d", $abox); # leftpad to 3 places.
@myfolders = sort by_number (keys(%{$whatwhere{$abox}}));
foreach $afolder (@myfolders){
undef @notMe; # to contain list of items already in EAD
# print "$abox contains folder $afolder\n";
undef @myItems;
@myItems = @{$whatwhere{$abox}{$afolder}};
$afolder += 0; # remove left padding
# foreach (@myItems){ print "Item $_\n";}
# State flags for the scan below, all reset for each folder:
undef $gotIt; # set once the folder has been located; checked after the scan
undef $foundBox; # exists when we find that box
undef $foundFolder; # exists when we find one of the needed folders
# NOTE(review): $foundDid is cleared here but never set or tested in this loop.
undef $foundDid; # exists when we've seen the close of the folder did
undef $foundC; # need to add new values BEFORE this; finding this undefs the others, go to next folder
undef @thisRound;
foreach $line (@changed){
# State 1: still looking for the box. The pattern matches "<digits>.<3-digit box>"
# followed by a closing container tag.
# NOTE(review): the pattern starts with " *", which suggests an opening tag in
# front of it was lost from this copy -- confirm.
if ((! $foundBox) && $line =~ / *\d*\.$lookfor *<\/container>/){
$foundBox = 1;
# print "found box $lookfor\n";
push (@thisRound, $line);
}
# State 2: box seen, folder not yet seen.
elsif($foundBox && (!$foundFolder) && (!$foundC)){
if ($line =~ /<\/c>/){ # this folder is listed somewhere else, or does not exist.
# need to keep looking through the EAD.
undef $foundBox;
}
# NOTE(review): as written this is unanchored on the left, so folder 1 would
# also match a container ending in "11"; an opening tag was probably lost here
# too -- confirm.
elsif ($line =~ / *$afolder *<\/container>/){
$foundFolder = 1;
# print "found folder $afolder\n";
$gotIt = 1;
}
push (@thisRound, $line);
# no effect: this branch is only entered while $foundC is already false
undef $foundC;
}
# State 3: inside the wanted folder, looking for its closing </c>.
elsif($foundBox && $foundFolder && (!$foundC)){
# print "looking for c.\n";
# NOTE(review): the pattern below is empty. In Perl an empty pattern re-uses the
# last successfully matched pattern, which is almost certainly not what was
# intended; presumably the original matched the opening tag of a child
# component and was lost -- confirm.
if ($line =~ //){
$closeChild = 1; # close the kid first.
}
# Inside a child component: collect the id of a dao that is already present.
# If it is one of the items we were about to add, remember it in @notMe so it
# is skipped at insertion time.
elsif ($closeChild && $line =~ /dao id="([^"]*)"/){
$itemThere = $1;
undef $found;
foreach $i (@myItems){
if ($i eq $itemThere){ $found = 1;}
}
if ($found){
push (@errors, "CHECK: $itemThere is already in the EAD\n");
push (@notMe, $itemThere);
}
}
# This </c> closes the child component, not the folder.
elsif($line =~ /<\/c>/ && $closeChild){
undef $closeChild;
}
# This </c> closes the folder itself: emit the new entries first; the closing
# line is pushed after them at the bottom of this branch.
elsif ($line =~ /<\/c>/){
$foundC = 1;
# print "found the end of that c -- inserting items\n";
# must insert items before this line
#
#
# [whatever we want to call the item]
#
#
#
# $count = 1;
foreach $item (@myItems){
undef $found;
# first check to see if it's already in there
foreach $i (@notMe){
if ($i eq $item){ $found = 1;}
}
if ($found){next;} # skip this one, it's in there
# instead of using count to give item number, let's use the last 3 digits of the item +0
# an item might not be digitized on purpose.
# (\1 in the replacement is the old-style spelling of $1)
($myname = $item) =~ s,.*?(\d{3})$,\1,;
$myname += 0;
# print "$item is Item $myname\n";
undef $mypurlnum;
$ref = "ref".$item;
# fetch a purl here
# check first to see if we have one yet for this item
$id2009 = $dbh->quote($item);
$sth = $dbh->prepare("select purlnum, realurl from lookup where id_2009 = $id2009")
or die "can't look for $id2009 in database: ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't look for $id2009 in lookup : ", $sth->errstr(),"\n";
($mypurlnum, $myrealurl) = $sth->fetchrow_array();
warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err();
$sth->finish();
# An existing row means the item already has a PURL: report it and re-use that
# number. Otherwise take the next free number ($lastone) and record it.
if ($mypurlnum){
push (@errors, "CHECK: $id2009 already has a PURL:\n$mypurlnum $myrealurl\n");
}
else{ # insert this baby
$mypurlnum = $lastone;
$purlnum = $dbh->quote($lastone);
$lastone ++;
$url = "http://acumen.lib.ua.edu/".$item;
$myurl = $dbh->quote($url);
$sth = $dbh->prepare("insert into lookup (dnum, id_2009, purlnum, realurl, datestamp, history) values (NULL,$id2009,$purlnum,$myurl,NULL,NULL)")
or die "can't prepare insert for $item: ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't insert into lookup $id2009 : ", $sth->errstr(),"\n";
$sth->finish();
}
# print "$item purlnum is $mypurlnum\n";
# NOTE(review): the six lines pushed below contain only whitespace and
# "Item N"; $ref and $mypurlnum computed above are never written out, so the
# markup these strings originally carried appears to have been lost -- confirm
# against an intact copy before running.
push (@thisRound, " \n");
push (@thisRound, " \n");
push (@thisRound, " Item $myname\n");
push (@thisRound, " \n");
push (@thisRound, " \n");
push (@thisRound, " \n");
# $count ++;
}
# Done with this folder. $foundC stays set, so states 2 and 3 cannot be
# re-entered and the rest of the EAD is copied through unchanged.
undef $foundBox;
undef $foundFolder;
}
# every line seen in state 3 is kept, after anything inserted above
push (@thisRound, $line);
}
# any other line is copied through untouched
else{ push (@thisRound, $line);}
}
@changed = @thisRound; # keep altering EAD till everything found and linked, then write it out
if (!$gotIt){ push(@errors, "ERROR!!! $abox $afolder is NOT IN THE EAD!! Items NOT linked!!!\n");}
}
}
# Write the altered EAD out, release the table lock, and tell the operator
# whether the README in the output directory needs to be looked at.
print OUT @changed;
# Buffered write errors only surface at close. This file replaces the finding
# aid, so a failure here is reported through the README rather than ignored.
close(OUT) or push (@errors, "ERROR!!! problem closing updated EAD ($ead): $!\n");
$sth = $dbh->prepare(" unlock tables")
    or die "Can't unlock dbase tables!! : ",$dbh->errstr(),"\n";
$sth->execute()
    or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
$dbh->disconnect();
if (@errors){
    print "All done. Check the README file in the output directory for errors, please!\n";
    foreach (@errors){ print OUTPUT $_;}
}
else{
    print "All done! Please check finding aid for results. Thank you!\n";
}
# the README handle is closed on both paths (it used to be closed only on errors)
close(OUTPUT);
# leave the closing message on screen for a moment
sleep(3);
# by_number
# Comparator for sort(): orders the two elements sort supplies in $a and $b
# numerically, smallest first.
sub by_number {
    return $a <=> $b;
}
# timestamp
# Builds a "YYYY-MM-DDThh:mm:ssZ" string from the current local time, stores it
# in the global $timestamp (read by the main script) and also returns it.
# The date parts are lexicals, so the call no longer overwrites package globals
# such as $year or $mon, and sprintf does the zero padding.
# NOTE(review): the value comes from localtime() although a trailing "Z"
# conventionally marks UTC; left as is because backup and README file names are
# built from this string.
sub timestamp{
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(); #gmtime();
    # localtime() months are 0-based and years are counted from 1900
    $timestamp = sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
    return $timestamp;
}