#!/usr/bin/perl use File::Copy; use DBI; # EadsToDbase # a modification of # eadToDbase to get the files in the notInDbase directory into the collection list online. # and then move these files into the deposits directory for archiving # this script should run about 24 hours after getEads # jody DeRidder, 7/21/10 # to add information to the database for collection delivery # -- dependent on filing title for alphabetization # -- dependent upon abstract for web description # opens each EAD, pulls out title, mss num, abstract, filing title # looks in InfoTrack.allColls for matching value # if no entry, makes one # if not listed as online, makes it online in Acumen # adds the Acumen url as the first one # also looks in InfoTrack.allColls for digital collections listing this one # as parentID -- if online and am putting this online, change the listLevel for # the digital collection(s) -- increment each, giving 1 to self # pull out abstract from the finding aid for the blurb. # if there is no entry already for this collection, add this blurb # sends errors to me... except database failures. ## Copyright (c) 2010, The University of Alabama Libraries. ## Contributed by Jody DeRidder, 7/30/10. ## All rights reserved. ## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: ## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. ## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in ## the documentation and/or other materials provided with the distribution. ## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products ## derived from this software without specific prior written permission. ##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR ##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. $acumenStart = "http://acumen.lib.ua.edu/"; $email = "jlderidder\@ua.edu"; $hostname = "localhost"; $port = "3306"; $user = "user"; $password = "password"; $database = "InfoTrack"; $errors = "/home/jlderidder/EadErrorMessage"; $dbh = DBI->connect("DBI:mysql:$database:$hostname:$port", $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n"; $h->{PrintError} = 1; $h->{RaiseError} = 1; $A = $dbh->quote("A"); # analog $D = $dbh->quote("D"); $one = $dbh->quote("1"); $ptype = $dbh->quote("mss"); $icon = $dbh->quote("http://libcontent1.lib.ua.edu/digital/images/mss.icon.png"); # location hardcoded here: $pickup = "/srv/deposits/EADs/notInDbase/"; $deposits = "/srv/deposits/EADs/new/"; $outbase = "/srv/www/htdocs/content/"; $new = "/cifs-mount2/new/"; $rem = "/cifs-mount2/remediated/"; $up = "/cifs-mount2/uploaded/"; ×tamp; #creates $date for foldering and $mydate for mysql $mydate = $dbh->quote($mydate); # first, test for content. opendir (NOT, $pickup) or die "can't look through $pickup\n"; while ($file = readdir(NOT)){ # print "looking at $file in pickup\n"; if ($file =~ /^\./){ next;} # skip dot files if ($file =~ /^(.*?)\.ead\.xml/){ $where{$1} = $pickup.$file; push (@todo, $file); $idToFile{$1} = $file; } } close(NOT); @ids = keys (%where); foreach $id (@ids){ $path = $where{$id}; if (! $path){ die "ERROR! I have $id but no file location in my list!\n";} if ($id =~ /^[a-z]{1}\d{4}\_(\d{7})/){ $num = $1 + 0; $ms = sprintf("%04d", $num); $mss = $dbh->quote("MSS $ms"); $pID = $dbh->quote($id); $plink = $dbh->quote($acumenStart.$id); $count = 1; # number of listings we know about so far for this id # here, open the EAD and get title, alphaBy, abstract !!!! undef $mytitle; undef $pblurb; undef $alphaBy; open(IN, $path) or die "can't open $path\n"; undef $bigline; undef $ptitle; undef $alphaBy; undef $pblurb; while ($line = ){ chomp $line; $line = &cleanup($line); #remove word encodings if ($line =~ /scopecontent/i){ last;} # only need top part of EAD $bigline .= $line; } if ($bigline =~ /]*?>(.*?)<\/abstract>/i){ $pblurb = $1; $pblurb =~ s,<[^>]*?>,,g; # remove any internal tags $pblurb =~ s, +, ,g; # remove extra spaces } if ($bigline =~ /([^<]*?)/i){ $ptitle = $1; $ptitle =~ s,<[^>]*?>,,g; # remove any internal tags $ptitle =~ s, +, ,g; # remove extra spaces } if ($bigline =~ /([^<]*?)<\/titleproper>/i){ $alphaBy = $1; $alphaBy =~ s,<[^>]*?>,,g; # remove any internal tags } if (!($ptitle && $alphaBy && $pblurb)){ push (@errors, "ERROR: CORRECT the database entries for $id!!\n\n title is -->$ptitle<-- and filing title is -->$alphaby<--\n"); push (@errors, " and blurb is -->$pblurb<-- \n\n"); } # is this collection in there? $sth = $dbh->prepare("select title, online, cannedLink from allColls where id_2009 like $pID and AnalogOrDigital like $A") or die "can't prepare select for $collnum to see if it's up: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't select to see if $collnum is up: ", $dbh->errstr(),"\n"; ($atitle, $up, $link) = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); $sth->finish(); # might want to do something with these later if ($atitle && $up){ push (@errors, "$atitle is online as $atitle at $link\n"); next;} elsif ($atitle && !$up){ push (@errors, "$atitle is NOT online but in database as $atitle with this link: $link\n"); next;} elsif ($up){ push (@errors, "ERROR! $id is listed online with NO TITLE! It is analog!\n");} else{ # no title, not there $myblurb = $dbh->quote($pblurb); $mytitle = $dbh->quote($ptitle); $myalpha = $dbh->quote($alphaBy); # check to see if digital components are there. If so, the EAD listlevel should be NULL, so the digital collection will list, and the link to this finding aid should # be added to the child if not there already. # if there ARE no digital components online yet, the EAD listlevel should be 1, so it will be in the online collection list. undef $parentListLevel; undef @updates; # look for children # if already in there with their own alpha scheme, use it to keep them in order and apply list levels, then alter it to match the parent $sth = $dbh->prepare("select dnum, title, online, id_2009, alphaby, listLevel, mssUrl, mssNum, sourceCollName, blurb from allColls where parentID like $pID and AnalogOrDigital like $D or id_2009 like $pID and AnalogOrDigital like $D order by alphaBy") or die "can't prepare select for $collnum to see if it's up: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't select to see if $collnum is up: ", $dbh->errstr(),"\n"; while ( ($dnum, $atitle, $up, $anid, $alph, $ll, $ablurb) = $sth->fetchrow_array()){ if ($up == 1){ $parentListLevel = "null"; # if only one of them is online, it ranks. } push (@updates, "update allColls set mssNum = $mss, mssUrl = $plink, parentID = $pID, sourceCollName = $mytitle where dnum = $myd"); } $sth->finish(); # insert this baby. #print "\ninsert into allColls (dnum, id_2009, title, listLevel, mssNum, blurb, type, alphaBy, AnalogOrDigital, online, inAcumen, iconLocation, dateLive, cannedLink) values (NULL, $pID, $mytitle, $one, $mss, $myblurb, $ptype, $myalpha, $A, $one, $one, $icon, $mydate, $plink)\n"; if ($parentListLevel){ # there's digital content, so list this as NULL $sth = $dbh->prepare("insert into allColls (dnum, id_2009, title, listLevel, mssNum, blurb, type, alphaBy, AnalogOrDigital, online, inAcumen, iconLocation, dateLive, cannedLink) values (NULL, $pID, $mytitle, NULL, $mss, $myblurb, $ptype, $myalpha, $A, $one, $one, $icon, $mydate, $plink)") or die "can't insert $pTitle parent into allColls: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't execute insert for parent $pTitle: ", $dbh->errstr(),"\n"; $sth->finish(); } else{ # there's no digital content, so for now this is listed as level 1, to show up on the online collection list $sth = $dbh->prepare("insert into allColls (dnum, id_2009, title, listLevel, mssNum, blurb, type, alphaBy, AnalogOrDigital, online, inAcumen, iconLocation, dateLive, cannedLink) values (NULL, $pID, $mytitle, $one, $mss, $myblurb, $ptype, $myalpha, $A, $one, $one, $icon, $mydate, $plink)") or die "can't insert $pTitle parent into allColls: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't execute insert for parent $pTitle: ", $dbh->errstr(),"\n"; $sth->finish(); } } foreach $insert (@updates){ $sth = $dbh->prepare($insert) or die "can't update $insert: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't execute $insert: ", $dbh->errstr(),"\n"; $sth->finish(); } undef @updates; } } foreach $id (@ids){ $old = $where{$id}; $new = $deposits.$idToFile{$id}; # print "copying $old to $new -- then deleting old if new exists\n"; `cp $old $new`; if (-e $new ){ unlink $old;} # if copies are successful, delete original } $dbh->disconnect(); if (@errors){ open (ERRORS, ">".$errors) or die "can't write to $errors\n"; foreach (@errors){ print ERRORS "$_\n";} close(ERRORS); $subject = q("ERRORS in uploading EADs to database"); `mail -s $subject $email < $errors`; } $username="user2"; $password="password2"; $mynum = 11; if ($mynum){ $dbh = DBI->connect ("dbi:mysql:checkscripts:content.lib.ua.edu:3306", $username, $password) or &sendmessage; $h->{PrintError} = 1; $h->{RaiserError} = 1; $mynum = $dbh->quote($mynum); $sth= $dbh->prepare("insert into ran values(NULL,$mynum,NULL,NULL)"); $sth->execute or die "can't insert into checkscripts: ",$dbh->errstr(),"\n"; $sth->finish; $dbh->disconnect(); } exit; sub timestamp{ #following for Windows #print "hit enter twice please\n"; #$date = `date`; #$time = `time`; #print $date."\n"; #if ($date =~ /.*? (\d*)\/(\d*)\/(\d*)/){ # $date = $3.$1.$2; # } #print $date."\n"; #if ($time =~ /.*? (\d*)\:(\d*)\:(\d*)\./){ # $time = $1.$2.$3; # } #print $time."\n"; #$timestamp = $date."T".$time; #print $timestamp."\n"; # following for unix ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = gmtime(); ##$year += 1900; $mon ++; if ($mon < 10){ $mon="0".$mon;} #need 2 digits if ($sec < 10){ $sec="0".$sec;} if ($min < 10){ $min="0".$min;} if ($hour < 10){ $hour="0".$hour;} if ($mday < 10){ $mday="0".$mday;} $year = $year + 1900; $date = "$year$mon$mday"; #\T$hour:$min:$sec\Z"; $mydate = "$year-$mon-$day"; } sub cleanup{ $line = @_[0]; if ($line =~ /\377\376/){ die "$collfile is in UTF-16! Cannot process\nPlease repair and start over.\n\n"; #,,; # byte order mark } $line =~ s,\r,,g; # no Windows newlines $line =~ s,\. \"(\s),\.\"$1,g; # no space between period and quote # try to repair MS word encodings of hyphens, quotes, apostrophes $line =~ s,\342\200\231,',g; # if you hexdump the file, in place of an apostrophe # you will see in the word line: 342 200 231 # hexdump -cox filename > output # gives octal, hex, and characters # or you can just hexdump -c and look for those goofy things $line =~ s,\342\200\230,',g; $line =~ s,\342\200\235,",g; $line =~ s,\342\200\234,",g; $line =~ s,\342\200\233,\-\-,g; $line =~ s,\342\200\224,\-\-,g; $line =~ s,\342\200\223,\-\-,g; $line =~ s,\342\200\246,\-,g; $line =~ s,\357\277\275,\',g; $line =~ s,\222,\',g; # shows up as <92> $line =~ s,\226,\-,g; # shows up as <96> # $line =~ s,> *"(.*)" *<,>\1<,; # try to remove extraneous quotes $line =~ s, \& , \&\; ,g; #encode ampersand $line =~ s, +, ,g; # get rid of multiple spaces, added to keep lines from running together. return $line; }