#!/usr/bin/perl
# NOTE: every statement below sits on its own line on purpose.  Perl treats
# everything from a '#' to the end of the line as a comment, so the 'use'
# statements and the configuration assignments must never share a line with
# the shebang or with a trailing comment -- otherwise they silently never run.
use File::Copy;
use DBI;

# relocating
# Jody DeRidder, 6/11/09
#
# This version reorders content from the share drive into the archival
# storage system.
#
# First, do a diff from /srv/deposits/content to the share drive area
# to see if we got the content ok; then delete off share.
# Then go to the collstuff directory ../collstuff and run collToDbase
# to put the collection info in the database.
#
# MODIFY to add links for online in Tonio's software.
#
# WARNING!  Before running this, do a   chmod -R 755 /archive
#           after running it, do        chmod -R 555 /archive
# This will enable you to write to the directories as root --
# and then close off that ability afterwards.
#
# After writing this, add links/manifests as needed on the top level of the
# /srv/www/htdocs/lockss/ directories (should be indicated by this script's
# output).
# Check links at http://libcontent1.lib.ua.edu/lockss/Manifest.html
# Then run checkem in this directory -- it will verify md5 sums of content
# that was moved, and delete from the deposits directory.
# Then go look in the deposits directory, make sure folders are clean, and
# delete them.
#
# Note that this EXPECTS scans in a Scans directory!!  Not on the top level.

# Uncomment to enable test mode (checked by the manifest-writing code below).
#$test = 1;

# Locations are hardcoded here.  These are deliberately package globals
# (no 'my', no 'use strict'): the rest of the script refers to them, and to
# many other undeclared variables, by bare name.

# Directory scanned for deposited collections.  The trailing slash is
# required: directory names are concatenated directly onto this value.
$inbase = "/srv/deposits/content/";

# Root of the archival storage tree.  The trailing slash is required for the
# same reason.
$outbase = "/srv/archive/";

# Base URL for the LOCKSS pages.
$linkbase = "http://libcontent1.lib.ua.edu/lockss";

# This is the list of copied files to be checked via md5; if they are the
# same, the old file will be deleted.  If not, an error is issued.
$moveme = "./moveme";
$hostname = "localhost"; $port = "3306"; $user = "username"; $password = "password"; $database = "InfoTrack"; #$oaibase = "oai:content.lib.ua.edu:$m/"; # add dnum $dbh = DBI->connect("DBI:mysql:$database:$hostname:$port", $user, $password) or die "can't connect to database: ",$DBI::errstr,"\n"; $h->{PrintError} = 1; $h->{RaiseError} = 1; open (MOVE, ">".$moveme) or die "can't add to $moveme\n"; # COMMENT OUT and uncomment manifest line below when ready to write manifests if ($test){ open (OUT, ">>RelocateManifests") or die "can't write to RelocateManifests \n";} opendir (BASE, $inbase) or die "can't open $inbase\n"; while ($file = readdir(BASE)){ if ($file =~ /^\./ ){ next; } # skip dot files # print "looking at $file\n"; if ($file =~ /^([a-z]{1}[\d]{4})\_([\d]{7})(\_)?([\d]{7})?/){ $topdir = $1; $secdir = $2; $thirdir = $4; if (! $thirdir){ $collnum = $topdir."_".$secdir; $collbase = $outbase.$topdir."/".$secdir."/"; } else{ $collnum = $topdir."_".$secdir."_".$thirdir; $collbase = $outbase.$topdir."/".$secdir."/".$thirdir."/"; $middleman = $outbase.$topdir."/".$secdir."/Documentation/Manifest.html"; $upperman = $outbase.$topdir."/Documentation/Manifest.html"; if (-e $upperman){ # need to alter this manifest to link in this collection print "ALTER MANIFEST: add $collnum to $upperman\n"; ${$alterMan{$upperman}}{$topdir."_".$secdir} = " "; # add title later -- is there one? } else{ # need to create this manifest print "CREATE MANIFEST: add $collnum to $upperman\n"; ${$makeMan{$upperman}}{$topdir."_".$secdir} =" "; # add title later -- is there one? } $olddir = $inbase.$file."/"; if (! 
-e $collbase){ `mkdir -p $collbase`; } # we will look here for png, jpg, gif, txt, rtf files # rename the image and text files for the collection number, and # store under "Documentation" at the collection level $admindir = $inbase.$file."/Admin"; # print "old admin dir for $file is $admindir\n"; $docdir = $collbase."Documentation"; $oldMDdir = $inbase.$file."/Metadata/"; $newMDdir = $collbase."Metadata"; $oldTrans = $inbase.$file."/Transcripts/"; if (! -e $docdir){ `mkdir -p $docdir`; } if (! -e $newMDdir){ `mkdir -p $newMDdir`; } # we also need to find scans directories. What are they? undef @scandirs; opendir(COLL, $olddir) or die "can't look in $olddir\n"; while ($file = readdir(COLL)){ if ($file =~ /^\./){ next;} # no dot directories if ($file =~ /.*?Scan.*?/i){ $adir = $olddir.$file; if ( -d $adir){ # make sure this is a directory push (@scandirs, $adir); } } } close(COLL); # we need to look for $base.$file."/Admin/$collnum\.xml # to extract the title for the collection # NOTE!! this does NOT pick up collnum.2.xml, collnum.3.xml, etc. # if this collection is another from the same analog collection, there may be different numbers $collinfo = $admindir."/".$collnum.".xml"; print "looking for $collinfo\n"; $found = 1; if (! (-e $collinfo)){ undef $found; for ($i=1; $i < 10; $i++){ $testme = "$admindir/$collnum.$i.xml"; if (-e $testme){ $collinfo = $testme; $found = 1; $last; } } # we need a copy of the collection file to create/alter manifests, with each dump. if (! 
$found){ undef $title; print "No admin xml file for $collnum; looking in database\n"; $id = $dbh->quote($collnum); $sth = $dbh->prepare("select title from digColls where id_2009 like $id") or die "can't prepare select for $collnum to see if it's up: ", $dbh->errstr(),"\n"; $sth->execute() or die "can't select to see if $collnum is up: ", $dbh->errstr(),"\n"; ($title) = $sth->fetchrow_array(); warn "Problem in fetchrow_array(): ",$sth->errstr(),"\n" if $sth->err(); if ($title){ print "$title is already in the database\n"; $sth->finish(); goto THISPART; } else{ $sth->finish(); die "not in database, and no xml file in $admindir\n"; } } } # HERE!! CHeck to see if each file begins with: # # # and ends with # if NOT, add them in ## --- ALL THIS IS IN ANOTHER SCRIPT NOW -- just need the title open (INFO, $collinfo) or die "can't open $collinfo\n"; undef $xml; undef $parentstart; undef $parentend; undef $title; undef @thisfile; while ($line = ){ $line =~ s,\r,,g; # no Windows newlines $line =~ s,\. 
\"(\s),\.\"$1,g; # no space between period and quote # try to repair MS word encodings of hyphens, quotes, apostrophes $line =~ s,\342\200\231,',g; # if you hexdump the file, in place of an apostrophe # you will see in the word line: 342 200 231 # hexdump -cox filename > output # gives octal, hex, and characters # or you can just hexdump -c and look for those goofy things $line =~ s,\342\200\230,',g; $line =~ s,\342\200\235,",g; $line =~ s,\342\200\234,",g; $line =~ s,\342\200\233,\-\-,g; $line =~ s,\342\200\224,\-\-,g; $line =~ s,\342\200\223,\-\-,g; $line =~ s,\342\200\246,\-,g; $line =~ s,\357\277\275,\',g; $line =~ s,\222,\',g; # shows up as <92> $line =~ s,\226,\-,g; # shows up as <96> $line =~ s,> *"(.*)" *<,>\1<,; # try to remove extraneous quotes $line =~ s, \& , \&\; ,g; #encode ampersand if ($line =~ /Digital\_Collection\_Name/){ ( $title = $line) =~ s,\<\/?Digital\_Collection\_Name\>,,g; chomp $title; } } close (INFO); THISPART: if (!$title){ die "ERROR, no $title from $collinfo\n";} # add titles for manifests if (${$makeMan{$upperman}}{$collnum}){ ${$makeMan{$upperman}}{$collnum} = $title;} if (${$alterMan{$upperman}}{$collnum}){ ${$alterMan{$upperman}}{$collnum} = $title;} # print "$collnum --> $title\n"; $manifest = $docdir."/Manifest.html"; $newbie = 1; # if this value exists, it's a new manifest being written # the only manifest not in this pattern is u0003_0000252, cabaniss, it's old. if (-e $manifest){ # we don't want to overwrite an existing manifest. # how do I add to one that exists???? HERE undef $newbie; open (MAN, $manifest) or die "can't read in $manifest\n"; # note that we may not need ALL these levels, but we need to support this many, just in case undef @level1; # end just after "Administrative Information" -- add more here. 
undef @level2; # end before transcripts or item level metadata; contains collection metadata undef @level3; # end before item level metadata; contains transcripts undef @level4; #end before content; contains item level metadata undef @level5; # end before last undef @level6; # last lines undef $itemMDExists; undef $transcriptsExist; $level = 1; while ($m = ){ if ($m =~ /Administrative Information *<\/h3>/){ push (@level1, $m); $level = 2; } elsif ($m =~ /Collection Level Metadata *<\/h3>/){ push (@level2, $m); $level = 3; } elsif ($m =~ /Metadata *<\/h3>/){ $itemMDExists = 1; $level = 4; } elsif ($m =~ /Transcripts *<\/h3>/){ $transcriptsExist = 1; $level = 5; } elsif ($m =~ /Content *<\/h3>/){ $level = 6; } elsif ($level == 1){ push(@level1, $m);} elsif ($level == 2){ push(@level2, $m);} elsif ($level == 3){ push(@level3, $m);} elsif ($level == 4){ push(@level4, $m);} elsif ($level == 5){ push(@level5, $m);} elsif ($level == 6){ push(@level6, $m);} } close(MAN); # now, start to rewrite it: if (! $test){ open (OUT, ">".$manifest) or die "can't write to $manifest\n"; } foreach (@level1){ print OUT $_;} } else{ if (! $test){ open(OUT, ">".$manifest) or warn "can't open $manifest to write.\n";} $head = ' '.$title.' Manifest Page

'.$title.' '.$collnum.' Manifest Page

'; $tail = '

LOCKSS system has permission to collect, preserve, and serve this Archival Unit

'; print OUT $head; } # now, move other content from the admin folder opendir(ADMIN, $admindir) or die "can't open $admindir\n"; if ($newbie){ print OUT " \n"; } else{ foreach (@level2){ print OUT $_;} # this prints the rest of the existing Admin data, and starts the # collection level metadata section } # next, metadata if (-e $oldMDdir){ undef $metsdir; opendir(MD, $oldMDdir) or die "can't open $oldMDdir\n"; if ($newbie){ print OUT " \n"; } else{ foreach (@level3){ print OUT $_;} # prints existing coll-level metadata -- then start of next section if ($itemMDexists){ print OUT "