#!/usr/bin/perl
use DBI;
use File::Copy;
# linkContent
# links content into EAD, using PURLs.
# this version for Cabaniss, to run off pre-generated jpegs on libcontent1
# copies this EAD into backup file first, then overwrites it with altered version.
# in this situation, all jpegs are in one directory, and there are 3 versions. We'll
# select the one ending in _2048.jpg to use (the largest)
# jpeg filenames reflect the EAD id
# this script parses box number, folder number
# pulls out filename ids of items (not their subpages)
# turns that into a link: http://acumen.lib.ua.edu/filenameID
# exchanges it for a PURL
# locates the correct EAD in the folder EADs
# reads it in, locates the correct box number, folder number
# and looks through ids of links already added to place this one in order
# prints out altered EAD in place of previous version,
# with component links.
##Copyright (c) 2009, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 12/10/09.
##All rights reserved.
##Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the
## distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR ##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# jody DeRidder, 12/10/09
# form of tag insert, per Shawn Averkamp:
#
#
# [whatever we want to call the item]
#
#
#
##!!! Altered 12.15.09 by Shawn to add AFTER item
#
#
#Francis William Kellogg Letter
#Sc0004
#2027.01
#
#
#
# and per Donnelly Lancaster Walton, the [whatever we want to call the item] should be:
# Item # -- reflecting the sequence, starting at one, for each folder.
# assume that the files in each folder are complete and in sequence.
# ref numbers need to be unique. Given that item numbers are unique, we will use those
# note that this assumes breakdown into series can occur at the folder level, but not below;
# boxes 21 and 26 of Cabaniss contain content in 2 different series
# earlier version by jody DeRidder, 11/16/09
# --- Directories, given relative to the current working directory ---
($eadDir, $backups, $content) = ("../EAD/", "../backups/", "../jpegs/");

# --- Collection being linked; its finding aid file is named after it ---
$coll = "u0003_0000252";
$ead  = "${coll}.ead.xml";

# --- Database login: put the site-specific values here ---
($hostname, $port) = ("localhost", "3306");
($user, $password) = ("addData", "moreStuff");
$database          = "InfoTrack";
# The script begins by backing up the current EAD under a timestamped name,
# opening the README that collects CHECK/ERROR messages for this run, and
# opening the EAD itself for reading.
# timestamp() (defined at the bottom of this file) fills in the global
# $timestamp; without this call both file names below would lack their suffix.
timestamp();
$backup = $ead.$timestamp;
$output = ">../output/Linking_README_$timestamp";
# $output already carries the ">" write mode, so this stays a 2-argument open.
open (OUTPUT, $output) or die "cannot write output errors: $!\n";
$old = $eadDir.$ead;
$new = $backups.$backup;
copy ($old, $new) or die "could not make a backup copy of $ead! ($!)\n";
# 3-argument open: explicitly read-only, and the path is never parsed for a mode.
open (IN, '<', $old) or die "can't read $ead: $!\n";
# NOTE(review): the first two lines below are not valid Perl as they stand. They
# look like the remains of the loop that read the EAD from IN: text between a
# "<" and a later ">" appears to have been lost from this copy of the script.
# Presumably the original read each line with <IN>, noted the ids of links
# already present, kept the lines in @thisEad, built the @doThese work list
# (both arrays are used further down but never assigned in the visible code),
# and finally set $ead to ">$old" so that the open below overwrites the EAD.
# TODO: restore this section from a clean copy before running the script.
while ($line = ){
if ($line =~ /$old";
open (OUT, $ead) or die "can't open $ead\n";
# Connect to the database, take a write lock on the lookup table, and work out
# the next purlnum to hand out (highest existing purlnum + 1). The lock is
# released by the "unlock tables" statement near the end of the script.
$dbh = DBI->connect("DBI:mysql:$database:$hostname:$port",
    $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n";
# These attributes must be set on the connection handle itself. They were
# previously assigned to an unrelated, undefined $h, which had no effect on
# the connection. With RaiseError on, DBI dies on any failed call; the
# explicit "or die" checks are kept as a second line of defence.
$dbh->{PrintError} = 1;
$dbh->{RaiseError} = 1;
$sth = $dbh->prepare(" lock table lookup write")
    or die "Can't lock dbase tables!! : ",$dbh->errstr(),"\n";
$sth->execute()
    or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
# A failed prepare leaves $sth undefined, so carrying on could only crash on
# the next line; stop here with the message instead of merely printing it.
$sth = $dbh->prepare(" select max(purlnum) from lookup")
    or die "Can't get a count! Kill me and call for help!! : ",$dbh->errstr(),"\n";
$sth->execute()
    or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
# List-context fetch of the single column; undef when the table has no rows,
# in which case the increment below makes the first purlnum 1.
($lastone) = $sth->fetchrow_array();
warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err();
$sth->finish();
$lastone ++;
# Group the items to be linked by box and folder.
# An item id has the form: one lowercase letter, 4 digits, "_", 7 digits, "_",
# then 2 digits of box, 2 digits of folder and 3 digits of item number.
# The ids are fixed-width and zero-padded, so a plain string sort puts them in
# box / folder / item order. A numeric sort cannot be used on them: they start
# with a letter, so each one has the numeric value 0 and the list would simply
# stay in the order it arrived in, which the per-folder lists below depend on.
@mylist = sort @doThese;
foreach $item (@mylist){
    if ($item =~ /^[a-z]{1}\d{4}\_\d{7}\_(\d{2})(\d{2})(\d{3})$/){
        $box= $1;
        $folder= $2;
        $thisItem = $3;
        $location{$item} = $box.'_'.$folder;
        # items arrive sorted, so each folder's list is built in item order
        push (@{$whatwhere{$box}{$folder}}, $item);
        # print "$box, $folder for item $item\n";
    }
    else{ push(@errors, "ERROR: Filename format is incorrect: -->$item<--\n");}
}
# Box keys are purely numeric strings, so the numeric comparator is right here.
@mybox = sort by_number( keys (%whatwhere));
print " here's my boxes:\n";
foreach (@mybox){ print "box $_\n";}
# here's what a box looks like. This is box 1, folder 1. Folders for a single box
# may be in different places in the EAD:
#
#
# Incoming, A - B. Cabaniss
# 252.001
# 1
#
#
# NOTE(review): the sample above has lost its XML markup in this copy of the file;
# going by the patterns matched below, "252.001" and "1" were presumably the box
# and folder container values -- TODO confirm against a real EAD.
# all items need to be entered after the and prior to the closing "c" for this folder.
# since folders of a single box may be in multiple places, we need to run through the EAD
# once per box.
# (As coded, the EAD lines are scanned once per box/folder pair, not once per box.)
#
# Overview: @changed starts as a copy of the EAD lines. For every box/folder pair
# that has items, the lines are copied into @thisRound, with the new item lines
# pushed in just before the closing "c" of that folder; @thisRound then becomes
# @changed for the next pair. Flags used while scanning one pair:
#   $foundBox    - set once the box container line has been seen
#   $foundFolder - set once the folder container line has been seen inside that box
#   $closeChild  - set while inside a child component already present in the folder
#   $foundC      - set once the folder's closing "c" was reached and items inserted
#   $gotIt       - set if the folder was found at all (drives the error at the end)
@changed = @thisEad;
foreach $abox (@mybox){
$lookfor = sprintf("%03d", $abox); # leftpad to 3 places.
@myfolders = sort by_number (keys(%{$whatwhere{$abox}}));
foreach $afolder (@myfolders){
undef @notMe; # to contain list of items already in EAD
# print "$abox contains folder $afolder\n";
undef @myItems;
# look the items up with the padded key first; the padding is stripped just below
@myItems = @{$whatwhere{$abox}{$afolder}};
$afolder += 0; # remove left padding
# foreach (@myItems){ print "Item $_\n";}
undef $gotIt;
undef $foundBox; # exists when we find that box
undef $foundFolder; # exists when we find one of the needed folders
undef $foundDid; # exists when we've seen the close of the folder did
undef $foundC; # need to add new values BEFORE this; finding this undefs the others, go to next folder
undef @thisRound;
# NOTE(review): $closeChild is not reset here with the other flags, so a value left
# set by the previous folder's scan would carry over -- confirm whether that can happen.
foreach $line (@changed){
# State 1: not inside the wanted box yet; look for its container line.
# NOTE(review): this pattern (and the folder pattern below) appears to have lost its
# opening tag; as written it matches a closing container preceded by ".<box number>".
if ((! $foundBox) && $line =~ / *\d*\.$lookfor *<\/container>/){
$foundBox = 1;
# print "found box $lookfor\n";
push (@thisRound, $line);
}
# State 2: inside the box, folder container not seen yet.
elsif($foundBox && (!$foundFolder) && (!$foundC)){
if ($line =~ /<\/c>/){ # this folder is listed somewhere else, or does not exist.
# need to keep looking through the EAD.
undef $foundBox;
}
elsif ($line =~ / *$afolder *<\/container>/){
$foundFolder = 1;
# print "found folder $afolder\n";
$gotIt = 1;
}
push (@thisRound, $line);
undef $foundC;
}
# State 3: inside the wanted folder; watch for existing child items and for the
# closing "c" of the folder itself.
elsif($foundBox && $foundFolder && (!$foundC)){
# print "looking for c.\n";
# NOTE(review): the pattern below is empty in this copy. In Perl an empty pattern
# re-uses the last successfully matched pattern; presumably this once matched the
# opening tag of a child component -- TODO restore.
if ($line =~ //){
$closeChild = 1; # close the kid first.
}
# Inside an existing child: pick up the id of its dao link and, if it is one of the
# items we were about to add, remember it so it is skipped at insertion time.
elsif ($closeChild && $line =~ /dao id="([^"]*)"/){
$itemThere = $1;
undef $found;
foreach $i (@myItems){
if ($i eq $itemThere){ $found = 1;}
}
if ($found){
push (@errors, "CHECK: $itemThere is already in the EAD\n");
push (@notMe, $itemThere);
}
}
# This closing "c" belongs to the existing child, not to the folder.
elsif($line =~ /<\/c>/ && $closeChild){
undef $closeChild;
}
# This closing "c" ends the folder: insert the new items before it.
elsif ($line =~ /<\/c>/){
$foundC = 1;
# print "found the end of that c -- inserting items\n";
# must insert items before this line
#
#
# [whatever we want to call the item]
#
#
#
# $count = 1;
foreach $item (@myItems){
undef $found;
# first check to see if it's already in there
foreach $i (@notMe){
if ($i eq $item){ $found = 1;}
}
if ($found){next;} # skip this one, it's in there
# instead of using count to give item number, let's use the last 3 digits of the item +0
# an item might not be digitized on purpose.
($myname = $item) =~ s,.*?(\d{3})$,\1,;
$myname += 0;
# print "$item is Item $myname\n";
undef $mypurlnum;
$ref = "ref".$item;
# fetch a purl here
# check first to see if we have one yet for this item
$id2009 = $dbh->quote($item);
$sth = $dbh->prepare("select purlnum, realurl from lookup where id_2009 = $id2009")
or die "can't look for $id2009 in database: ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't look for $id2009 in lookup : ", $sth->errstr(),"\n";
($mypurlnum, $myrealurl) = $sth->fetchrow_array();
warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err();
$sth->finish();
# An existing PURL is reported but re-used; the link lines below are still added.
if ($mypurlnum){
push (@errors, "CHECK: $id2009 already has a PURL:\n$mypurlnum $myrealurl\n");
}
else{ # insert this baby
# take the next free purlnum and record the item's URL against it
$mypurlnum = $lastone;
$purlnum = $dbh->quote($lastone);
$lastone ++;
$url = "http://acumen.lib.ua.edu/".$item;
$myurl = $dbh->quote($url);
$sth = $dbh->prepare("insert into lookup (dnum, id_2009, purlnum, realurl, datestamp, history) values (NULL,$id2009,$purlnum,$myurl,NULL,NULL)")
or die "can't prepare insert for $item: ",$dbh->errstr(),"\n";
$sth->execute()
or die "Can't insert into lookup $id2009 : ", $sth->errstr(),"\n";
$sth->finish();
}
# print "$item purlnum is $mypurlnum\n";
# NOTE(review): the six strings pushed below look like they have lost their markup
# in this copy; $ref and $mypurlnum are computed above but appear in none of them.
# Presumably these lines formed the component holding the link for this item --
# TODO restore from a clean copy.
push (@thisRound, " \n");
push (@thisRound, " \n");
push (@thisRound, " Item $myname\n");
push (@thisRound, " \n");
push (@thisRound, " \n");
push (@thisRound, " \n");
# $count ++;
}
# Done with this folder: with $foundC set and these cleared, the remaining lines of
# this pass fall through to the plain copy branch.
undef $foundBox;
undef $foundFolder;
}
push (@thisRound, $line);
}
# Any other line is copied through unchanged.
else{ push (@thisRound, $line);}
}
@changed = @thisRound; # keep altering EAD till everything found and linked, then write it out
if (!$gotIt){ push(@errors, "ERROR!!! $abox $afolder is NOT IN THE EAD!! Items NOT linked!!!\n");}
}
}
# Write the altered EAD, release the database lock, and report to the user.
# Every CHECK/ERROR message gathered during the run goes to the OUTPUT README.
print OUT @changed;
# Buffered write errors only surface at close, and OUT is the altered EAD
# itself, so a failed close is recorded rather than silently ignored.
close(OUT) or push(@errors, "ERROR!!! Could not finish writing the altered EAD: $!\n");
$sth = $dbh->prepare(" unlock tables")
    or die "Can't unlock dbase tables!! : ",$dbh->errstr(),"\n";
$sth->execute()
    or die "Can't execute SQL statement: ", $sth->errstr(),"\n";
$dbh->disconnect();
if (@errors){
    print "All done. Check the README file in the output directory for errors, please!\n";
    print OUTPUT @errors;
}
else{
    print "All done! Please check finding aid for results. Thank you!\n";
}
# Close the README on both paths (it was previously left open when there were
# no errors), then pause so the message can be read.
close(OUTPUT);
sleep(3);
# Sort comparator: orders the two values being compared ($a and $b)
# numerically, smallest first. Use as: sort by_number @list
sub by_number {
    return $a <=> $b;
}
# Build a timestamp of the form YYYY-MM-DDThh:mm:ssZ.
# Takes no arguments. Stores the string in the global $timestamp (which the
# rest of the script reads) and also returns it.
# NOTE: the fields come from localtime(), so despite the trailing "Z" the
# value is local time, not UTC; left that way so the output is unchanged.
sub timestamp{
    # Lexicals, so the caller's globals of the same names are not clobbered.
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(); #gmtime();
    # localtime() counts months from 0 and years from 1900. sprintf does the
    # zero padding, and the literal T and Z no longer depend on the
    # unrecognised string escapes \T and \Z being passed through.
    $timestamp = sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
    return $timestamp;
}