#!/usr/bin/perl
use File::Copy;
use DBI;
# relocating
##Copyright (c) 2009, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 6/11/09.
##All rights reserved.
##Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the
## distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Jody DeRidder, 6/11/09
# this version to reorder content from share drive
# into archival storage system
# first, do a diff from /srv/deposits/content to the share drive area
# to see if we got the content ok; then delete off share
# then go to the collstuff directory ../collstuff and run collToDbase
# to put the collection info in the database.
# MODIFY to add links for online in Tonio's software # WARNING! Before running this, do a chmod -R 755 /archive # after running it, do chmod -R 555 /archive # this will enable you to write to the directories as root -- # and then close off that ability afterwards. # after writing this, add links/manifests as needed on top level of /srv/www/htdocs/lockss/ directories # (should be indicated by this script's output) # check links at http://libcontent1.lib.ua.edu/lockss/Manifest.html # then run checkem in this directory -- it will verify md5 sums of content that was moved, # and delete from the deposits directory. # then go look in the deposits directory, make sure folders are clean, and delete them. # Note that this EXPECTS scans in a Scans directory!! not on top level. #$test = 1; # location hardcoded here: $inbase = "/srv/deposits/content/"; $outbase = "/srv/archive/"; $linkbase = "http://libcontent1.lib.ua.edu/lockss"; $moveme = "./moveme"; # this is the list of copied files to be checked via md5; if they are the same, the old file will be deleted. If not, an error is issued. $hostname = "localhost"; $port = "3306"; $user = "username"; $password = "password"; $database = "InfoTrack"; #$oaibase = "oai:content.lib.ua.edu:$m/"; # add dnum $dbh = DBI->connect("DBI:mysql:$database:$hostname:$port", $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n"; $h->{PrintError} = 1; $h->{RaiseError} = 1; open (MOVE, ">".$moveme) or die "can't add to $moveme\n"; # COMMENT OUT and uncomment manifest line below when ready to write manifests if ($test){ open (OUT, ">>RelocateManifests") or die "can't write to RelocateManifests \n";} opendir (BASE, $inbase) or die "can't open $inbase\n"; while ($file = readdir(BASE)){ if ($file =~ /^\./ ){ next; } # skip dot files # print "looking at $file\n"; if ($file =~ /^([a-z]{1}[\d]{4})\_([\d]{7})(\_)?([\d]{7})?/){ $topdir = $1; $secdir = $2; $thirdir = $4; if (! 
$thirdir){ $collnum = $topdir."_".$secdir; $collbase = $outbase.$topdir."/".$secdir."/"; } else{ $collnum = $topdir."_".$secdir."_".$thirdir; $collbase = $outbase.$topdir."/".$secdir."/".$thirdir."/"; $middleman = $outbase.$topdir."/".$secdir."/Documentation/Manifest.html"; $upperman = $outbase.$topdir."/Documentation/Manifest.html"; if (-e $upperman){ # need to alter this manifest to link in this collection print "ALTER MANIFEST: add $collnum to $upperman\n"; ${$alterMan{$upperman}}{$topdir."_".$secdir} = " "; # add title later -- is there one? } else{ # need to create this manifest print "CREATE MANIFEST: add $collnum to $upperman\n"; ${$makeMan{$upperman}}{$topdir."_".$secdir} =" "; # add title later -- is there one? } $olddir = $inbase.$file."/"; if (! -e $collbase){ `mkdir -p $collbase`; } # we will look here for png, jpg, gif, txt, rtf files # rename the image and text files for the collection number, and # store under "Documentation" at the collection level $admindir = $inbase.$file."/Admin"; # print "old admin dir for $file is $admindir\n"; $docdir = $collbase."Documentation"; $oldMDdir = $inbase.$file."/Metadata/"; $newMDdir = $collbase."Metadata"; $oldTrans = $inbase.$file."/Transcripts/"; if (! -e $docdir){ `mkdir -p $docdir`; } if (! -e $newMDdir){ `mkdir -p $newMDdir`; } # we also need to find scans directories. What are they? undef @scandirs; opendir(COLL, $olddir) or die "can't look in $olddir\n"; while ($file = readdir(COLL)){ if ($file =~ /^\./){ next;} # no dot directories if ($file =~ /.*?Scan.*?/i){ $adir = $olddir.$file; if ( -d $adir){ # make sure this is a directory push (@scandirs, $adir); } } } close(COLL); # we need to look for $base.$file."/Admin/$collnum\.xml # to extract the title for the collection # NOTE!! this does NOT pick up collnum.2.xml, collnum.3.xml, etc. 
# if this collection is another from the same analog collection, there may be different numbers $collinfo = $admindir."/".$collnum.".xml"; print "looking for $collinfo\n"; $found = 1; if (! (-e $collinfo)){ undef $found; for ($i=1; $i < 10; $i++){ $testme = "$admindir/$collnum.$i.xml"; if (-e $testme){ $collinfo = $testme; $found = 1; $last; } } # we need a copy of the collection file to create/alter manifests, with each dump. if (! $found){ undef $title; print "No admin xml file for $collnum; looking in database\n"; $id = $dbh->quote($collnum); $sth = $dbh->prepare("select title from digColls where id_2009 like $id") or die "can't prepare select for $collnum to see if it's up: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't select to see if $collnum is up: ", $dbh->errstr(),"\n"; ($title) = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); if ($title){ print "$title is already in the database\n"; $sth->finish(); goto THISPART; } else{ $sth->finish(); die "not in database, and no xml file in $admindir\n"; } } } # HERE!! CHeck to see if each file begins with: # # # and ends with # if NOT, add them in ## --- ALL THIS IS IN ANOTHER SCRIPT NOW -- just need the title open (INFO, $collinfo) or die "can't open $collinfo\n"; undef $xml; undef $parentstart; undef $parentend; undef $title; undef @thisfile; while ($line = ){ $line =~ s,\r,,g; # no Windows newlines $line =~ s,\. 
\"(\s),\.\"$1,g; # no space between period and quote # try to repair MS word encodings of hyphens, quotes, apostrophes $line =~ s,\342\200\231,',g; # if you hexdump the file, in place of an apostrophe # you will see in the word line: 342 200 231 # hexdump -cox filename > output # gives octal, hex, and characters # or you can just hexdump -c and look for those goofy things $line =~ s,\342\200\230,',g; $line =~ s,\342\200\235,",g; $line =~ s,\342\200\234,",g; $line =~ s,\342\200\233,\-\-,g; $line =~ s,\342\200\224,\-\-,g; $line =~ s,\342\200\223,\-\-,g; $line =~ s,\342\200\246,\-,g; $line =~ s,\357\277\275,\',g; $line =~ s,\222,\',g; # shows up as <92> $line =~ s,\226,\-,g; # shows up as <96> $line =~ s,> *"(.*)" *<,>\1<,; # try to remove extraneous quotes $line =~ s, \& , \&\; ,g; #encode ampersand if ($line =~ /Digital\_Collection\_Name/){ ( $title = $line) =~ s,\<\/?Digital\_Collection\_Name\>,,g; chomp $title; } } close (INFO); THISPART: if (!$title){ die "ERROR, no $title from $collinfo\n";} # add titles for manifests if (${$makeMan{$upperman}}{$collnum}){ ${$makeMan{$upperman}}{$collnum} = $title;} if (${$alterMan{$upperman}}{$collnum}){ ${$alterMan{$upperman}}{$collnum} = $title;} # print "$collnum --> $title\n"; $manifest = $docdir."/Manifest.html"; $newbie = 1; # if this value exists, it's a new manifest being written # the only manifest not in this pattern is u0003_0000252, cabaniss, it's old. if (-e $manifest){ # we don't want to overwrite an existing manifest. # how do I add to one that exists???? HERE undef $newbie; open (MAN, $manifest) or die "can't read in $manifest\n"; # note that we may not need ALL these levels, but we need to support this many, just in case undef @level1; # end just after "Administrative Information" -- add more here. 
undef @level2; # end before transcripts or item level metadata; contains collection metadata undef @level3; # end before item level metadata; contains transcripts undef @level4; #end before content; contains item level metadata undef @level5; # end before last undef @level6; # last lines undef $itemMDExists; undef $transcriptsExist; $level = 1; while ($m = ){ if ($m =~ /Administrative Information *<\/h3>/){ push (@level1, $m); $level = 2; } elsif ($m =~ /Collection Level Metadata *<\/h3>/){ push (@level2, $m); $level = 3; } elsif ($m =~ /Metadata *<\/h3>/){ $itemMDExists = 1; $level = 4; } elsif ($m =~ /Transcripts *<\/h3>/){ $transcriptsExist = 1; $level = 5; } elsif ($m =~ /Content *<\/h3>/){ $level = 6; } elsif ($level == 1){ push(@level1, $m);} elsif ($level == 2){ push(@level2, $m);} elsif ($level == 3){ push(@level3, $m);} elsif ($level == 4){ push(@level4, $m);} elsif ($level == 5){ push(@level5, $m);} elsif ($level == 6){ push(@level6, $m);} } close(MAN); # now, start to rewrite it: if (! $test){ open (OUT, ">".$manifest) or die "can't write to $manifest\n"; } foreach (@level1){ print OUT $_;} } else{ if (! $test){ open(OUT, ">".$manifest) or warn "can't open $manifest to write.\n";} $head = ' '.$title.' Manifest Page

'.$title.' '.$collnum.' Manifest Page

'; $tail = '

LOCKSS system has permission to collect, preserve, and serve this Archival Unit

'; print OUT $head; } # now, move other content from the admin folder opendir(ADMIN, $admindir) or die "can't open $admindir\n"; if ($newbie){ print OUT " \n"; } else{ foreach (@level2){ print OUT $_;} # this prints the rest of the existing Admin data, and starts the # collection level metadata section } # next, metadata if (-e $oldMDdir){ undef $metsdir; opendir(MD, $oldMDdir) or die "can't open $oldMDdir\n"; if ($newbie){ print OUT " \n"; } else{ foreach (@level3){ print OUT $_;} # prints existing coll-level metadata -- then start of next section if ($itemMDexists){ print OUT "