#!/usr/bin/perl use DBI; use File::Copy; #processEtds # looks in subdirectory of incoming # locates optional directories # asks which one # sets up directories in working: top directory matches date of selected directory to work # within it, PRQ and CONTENT and OPEN # picks up zip files one by one from incoming directory indicated # opens in OPEN using unzip -d filename (minus the zip) so we can capture the directory name # goes into said directory; counts the number of files. Error if no pdf or no .*DATA.xml. # more than 2 files indicates subsidiary files; capture that count for the spreadsheet. # capture original filename. # get today's date; estimate this will be on the web the next month. ? Capture this. # open the DATA.xml file. # pull embargo code from 2nd line: # if embargo code does not equal zero, calculate the date embargo lifts and adjust the captured web # availablity date. # this is the line where I am to insert attributes UA_identifier="NewFilename" and UA_purl="http://purl.lib.ua.edu/num" -- so don't write to new file in PRQ yet. # read in lines till I get to which is the end of the first name entered in , firstname from , # middle name from , suffix from . Compile these into: # Lastname(followed by space Suffix if there is one), Firstname Middle -- this is the author. # then look for and pull that for the title # then for completion date # DISS_accept_date for acceptance date # look for subsidiary file info. description, filename, filetype # if none, at least get extension and original filename, and use these in the renaming process. # call InfoTrack.bornDigital and check to see if this file has been entered. # if not, get next filenumber, enter this one, and use lookup to get a PURL. # also enter date this is to go live on the web. This will be used by the crawling script # to determine if embargo is lifted. # write this XML file into the PRQ directory with the new filename.prq.xml, adding # new filename and purl into the connect ("dbi:mysql:InfoTrack:libcontent1.lib.ua.edu:3306", $dbuser, $dbpass) or die "can't connect from encompass to mysql on libcontent1 --InfoTrack: ".$dbh->error(); $h->{PrintError} = 1; $h->{RaiserError} = 1; # get today's date ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)=localtime(); $year += 1900; $mon += 1; if ($mon =~ /^[1-9]{1}$/){ $mon = "0".$mon;} if ($mday =~ /^[1-9]{1}$/){ $mday = "0".$mday;} #$if ($hour =~ /^[1-9]{1}$/){ $hour = "0".$hour;} #if ($min =~ /^[1-9]{1}$/){ $min = "0".$min;} #if ($sec =~ /^[1-9]{1}$/){ $sec = "0".$sec;} #print "$year-$mon-$mday\n"; #:$hour $min $sec\n"; $today = $year.$mon.$mday; # looks in subdirectory of incoming # locates optional directories # asks which one # sets up directories in working: top directory matches date of selected directory to work # within it, PRQ and CONTENT and OPEN opendir (STUFF, "../etd_deposits/") or die "can't look in incoming\n"; $count = 1; while ($file = readdir(STUFF)){ if ($file =~ /^\./){next;} push (@gather, $file); } @subdirs = sort(@gather); # $subdirs{$count} = $file; # $count ++; # } close(STUFF); print "Which directory shall we work?\n Pick a number and hit enter\n"; foreach $dir (@subdirs){ print "$count: $dir\n"; $match{$count} = $dir; $count ++; } #@numdirs = sort keys (%subdirs); #foreach $num (@numdirs){ # print "$num: ".$subdirs{$num}."\n"; # } $num = ; #$num = 1; chomp($num); if (! $match{$num}){ print "ERROR: $num directory does not exist.\n"; exit; } else{ $embargoStart = $year."0601"; #embargo start date is June 1 of current year. # embargoes can be 6 months, 1 year, or 2 years. #embargo_code (0|1|2|3|4) # default 0 # ProQuest comments: Embargo code can be 0 corresponding to no embargo OR # 1 - 6 months embargo # 2 - 1 year embargo # 3 - 2 year embargo # 4 - Reserved for future use #May submissions, embargo starts from June 1st, as you indicated. #August submissions, embargo starts from September 1st. #December submissions, embargo will run from January 1st. $enum = 4; while ($enum > 3){ print "\nPick a date from which to start the embargo:\n"; print "1) May submissions start June 1\n"; print "2) August submissions start September 1\n"; print "3) December submissions start January 1\n"; print "Which is this? Enter 1, 2, or 3, and hit enter\n\n"; $enum = ; chomp($enum); if ($enum > 3){ print "$enum is not a valid selection. Try again.\n";} } if ($enum eq 1){ # calculate date of embargo lift; $embargo{"0"} = $year."0601"; $embargo{"1"} = $year."1201"; $embargo{"2"} = ($year+1)."0601"; $embargo{"3"} = ($year+2)."0601"; } elsif ($enum eq 2){ $embargo{"0"} = $year."0901"; $embargo{"1"} = ($year+1)."0301"; $embargo{"2"} = ($year+1)."0901"; $embargo{"3"} = ($year+2)."0901"; } elsif ($enum eq 3){ # if submitted in December, we will be processing in January of the following year. $embargo{"0"} = $year."0101"; $embargo{"1"} = $year."0701"; $embargo{"2"} = ($year+1)."0101"; $embargo{"3"} = ($year+2)."0101"; } else{ die "embargo date code $enum does not correspond to any choice\n";} } $workme = $match{$num}; $indir = "../etd_deposits/$workme/"; $workdir = "../working/$workme/"; if (! -e $workdir){ `mkdir $workdir`;} $open = $workdir."OPEN/"; $prq = $workdir."PRQ/"; $content = $workdir."CONTENT/"; if (! -e $open){ `mkdir $open`;} if (! -e $prq){ `mkdir $prq`;} if (! -e $content){ `mkdir $content`;} print "\nWhen this script ends,\nyou will find the files you need in $workdir\n\n"; # a new spreadsheet each time. $spreadsheet = "../working/$workme/xmlList.xml"; open (OUT, ">".$spreadsheet) or die "cannot write to $spreadsheet\n"; print OUT ' '; # picks up zip files one by one from incoming directory indicated # opens in OPEN using unzip -d filename (minus the zip) so we can capture the directory name # goes into said directory; counts the number of files. Error if no pdf or no .*DATA.xml. # more than 2 files indicates subsidiary files; capture that count for the spreadsheet. # capture original filename. # files look like this: etdadmin_upload_10193.zip # pull off number after upload_ and before .zip, use that for directory, and store it # unzip -d 10193 etdadmin_upload_10193.zip opendir (IN, $indir) or die "can't look in $indir\n"; while ($file = readdir(IN)){ if ($file =~ /^\./){next;} if ($file =~ /etdadmin_upload_(.*?)\.zip/){ $dirname = $1; $mydir = $open.$dirname; $path = $indir.$file; `unzip -d $mydir $path`; $workdirs{$mydir} = $dirname; } } close(IN); # now look through the new files... # a subdir will look like this: #working/20090629/10999: #Wilson_alatus_0004D_10052_DATA.xml Wilson_alatus_0004D_10052.pdf Wilson_alatus_0004D_176 # where the last is another subdir containing supplementary files: #[root@encompass etds]# ls working/20090629/*/*176 #working/20090629/10386/Phelps_alatus_0004M_176: #Caravaggio_Doubting_Thomas.jpg # #working/20090629/10999/Wilson_alatus_0004D_176: #Appendixesfordissertation.pdf @workus = keys (%workdirs); foreach $dir (@workus){ undef %subfiles; undef $mypdf; undef $myxml; undef $origname; opendir (DIR, $dir) or die "can't look in $dir\n"; while ($file = readdir(DIR)){ if ($file =~ /^\./){next;} $path = $dir."/".$file; if (-d $path){ opendir (SUB, $path) or die "can't look through $path\n"; while ($bfile = readdir(SUB)){ if ($bfile =~ /^\./){next;} $pathb = $path."/".$bfile; $subfiles{$bfile} = $pathb; # collect those, we'll go back and name /copy them } close(SUB); } elsif ($file =~ /^(.*?)\.xml/ ){ $origname = $1; $myxml = $path; } elsif ($file =~ /^(.*?)\.pdf/ ){ $mypdf = $path; } else{ print "WHAT IS THIS? $file in $dir\n";} } close(DIR); if (!( $mypdf && $myxml)){ die "ERROR: in $dir we're missing something: PDF: $mypdf, XML: $myxml\n";} # open the DATA.xml file. # pull embargo code from 2nd line: # if embargo code does not equal zero, calculate the date embargo lifts and adjust the captured web # availablity date. # this is the line where I am to insert attributes UA_identifier="NewFilename" and UA_purl="http://purl.lib.ua.edu/num" -- so don't write to new file in PRQ yet. # read in lines till I get to which is the end of the first name entered in , firstname from , # middle name from , suffix from . Compile these into: # Lastname(followed by space Suffix if there is one), Firstname Middle -- this is the author. # then look for and pull that for the title # then for completion date # DISS_accept_date for acceptance date # look for subsidiary file info. description, filename, filetype # looks like this # # Appendixesfordissertation.pdf # pdf # Appendix for Dissertation # # if filename doesn't match what you have in %subfiles, there's a problem. # save this info... where? # I think for now, just check to see file name matches what I found? Nah. # come back when I have to capture tech info for storage. open(DATA, $myxml) or die "can't read in $myxml!\n"; undef $title; undef $name; undef $fname; undef $lastname; undef $midname; undef $suffix; undef $completed; undef $accepted; undef $firstLine; undef $getAuthor; undef $embargoCode; undef $myembargo; undef @thisfile; while ($line = ){ $printme = 1; if ($line =~ /(.*?)<\/DISS\_title>/){ $title = $1;} elsif ($line =~ /(.*?)<\/DISS\_comp\_date>/){ $completed = $1;} elsif ($line =~ /(.*?)<\/DISS\_accept\_date>/){ $accepted = $1;} elsif ($line =~ /DISS\_author type=\"primary\"/){ $getAuthor = 1;} elsif ($getAuthor eq 1 && $line =~ /(.*?)<\/DISS\_surname>/){$lastname = $1;} elsif ($getAuthor eq 1 && $line =~ /(.*?)<\/DISS\_fname>/){$fname = $1;} elsif ($getAuthor eq 1 && $line =~ /(.*?)<\/DISS\_middle>/){$midname = $1;} elsif ($getAuthor eq 1 && $line =~ /(.*?)<\/DISS\_suffix>/){$suffix = $1;} elsif ($line =~ /<\/DISS\_name>/){ undef $getAuthor;} elsif ($line =~ /(/){ $firstline = $1; $embargoCode = $2; $myembargo = $embargo{$embargoCode}; # print "embargo for $myxml is $embargoCode! so my release date is $myembargo\n"; undef $printme; } elsif ($line =~ /<\?xml version=/){ undef $printme;} # \"1\.0\" encoding="iso-8859-1"?> if ($printme){ push (@thisfile, $line); } } close(DATA); if (!$title){ die "no title for $myxml\n";} if (!$lastname){ die "no last name for $myxml\n";} if (!$fname){ die "no first name for $myxml\n";} if (!$myembargo){ die "no embargo code found! $myxml\n";} # now.... database stuff, then write the altered DATA file to a new filename in PRQ, and copy/rename # the pdf and supplementary files. And write to xmlList.xml if ($suffix){ $last = $dbh->quote($lastname." ".$suffix); $name = $lastname." ".$suffix.", "; } else{ $last = $dbh->quote($lastname); $name = $lastname.", "; } if (!$midname){ $first = $dbh->quote($fname); $name .= $fname; } else{ $first = $dbh->quote($fname." ".$midname); $name .= $fname." ".$midname; } $mytitle = $dbh->quote($title); $myavail = $dbh->quote($myembargo); undef $mypurl; undef $myId; $coll = $dbh->quote("u0015_0000001"); $huntthis = $dbh->quote("u0015_0000001_%"); $sth= $dbh->prepare("select lastName, firstName, title, id_2009, datestamp from bornDigital where lastName like $last and firstName like $first"); $sth->execute or die( "can't select to see if $myxml is in InfoTrack: ".dbh->errstr()); ($alname, $afname, $atitle, $anId, $ds) = $sth->fetchrow_array(); $sth->finish; if ($anId){ # print "$myxml is already assigned!!\n"; # foreach (@inthere){ print "$_ ";} # print "\n\n"; # print "I believe the id for this is $anId.\n"; $myId = $anId; $id_2009 = $dbh->quote($myId); #HERE -- pull out the id_2009 from this, and get the PURL from lookup table $sth= $dbh->prepare("select purlnum from lookup where id_2009 like $id_2009"); $sth->execute or die( "can't select purlnum from lookup for $id_2009: ".$dbh->errstr()); @resp = $sth->fetchrow_array(); if (! @resp){ die "no purlnum for $id_2009 in lookup! \n";} else{ $val = $resp[0]; $val ++; $mypurl = "http://purl.lib.ua.edu/".$val; } $sth->finish; # print "and my purl, already assigned, is $mypurl\n"; } else{ $sth= $dbh->prepare("lock tables lookup as look2 read, lookup write, bornDigital write"); $sth->execute or die( "can't lock tables from lookup: ".$dbh->errstr()); # $sth->finish(); $sth= $dbh->prepare("select max(id_2009) from bornDigital where id_2009 like $huntthis"); $sth->execute or die( "can't select maxid from lookup: ".$dbh->errstr()); @resp = $sth->fetchrow_array(); if (! @resp){ die "no maxid in lookup! \n";} else{ $val = $resp[0]; if ($val =~ /^u0015\_0000001\_(\d*)$/){ $increment = $1 +1; $newnum = sprintf ("%07d", $increment); $myId = "u0015_0000001_".$newnum; $mylink = "http://acumen.lib.ua.edu/".$myId; # print "my number will be $myId and my link will be $mylink\n"; } else { die "What IS this max ID from lookup? $val\n";} } $sth->finish; $id_2009 = $dbh->quote($myId); # print "insert into bornDigital (dnum, id_2009, lastName, firstName, collnum, title, dateAvailable) values (NULL, $id_2009, $last, $first, $coll, $mytitle, $myavail)\n"; $sth= $dbh->prepare("insert into bornDigital (dnum, id_2009, lastName, firstName, collnum, title, dateAvailable) values (NULL, $id_2009, $last, $first, $coll, $mytitle, $myavail)"); $sth->execute or die( "can't insert into bornDigital for $id_2009: ".$dbh->errstr()); $sth->finish; # if (! $myId){ die "$myxml has no id.\n";} # else{ print "$myxml will be $myId\n";} # now get a purl # print "select max(purlnum) from lookup as look2\n"; $sth= $dbh->prepare("select max(purlnum) from lookup as look2"); $sth->execute or die( "can't get max purlnum from lookup : ".$dbh->errstr()); @resp = $sth->fetchrow_array(); $lastnum = $resp[0]; $sth->finish; if (! $lastnum){ die "no last purlnum\n";} $lastnum ++; $mypurl = "http://purl.lib.ua.edu/".$lastnum; # print "my purlnum will be $lastnum, and my purl will be $mypurl\n"; $purlnum = $dbh->quote($lastnum); $realurl = $dbh->quote($mylink); # print "insert into lookup (NULL, id_2009, purlnum, realurl) values (NULL, $id_2009, $purlnum, $mylink)\n"; $sth= $dbh->prepare("insert into lookup (dnum, id_2009, purlnum, realurl) values (NULL, $id_2009, $purlnum, $realurl)"); $sth->execute or die( "can't insert into lookup for $myId: ".$dbh->errstr()); $sth->finish; $sth= $dbh->prepare("unlock tables"); $sth->execute or die( "can't unlock tables : ".$dbh->errstr()); $sth->finish; } # NOW -- write to spreadsheet, # get number of subsidiary files $numkids = scalar keys(%subfiles); print OUT ' '.$myId.' '.$origname.' '.$name.' '.$title.' '.$workdirs{$dir}.' '.$completed.' '.$accepted.' '.$numkids.' '.$embargoCode.' '.$myembargo.' '.$mypurl.' '; # then rename files and write out the data file. $mynewxml = $prq.$myId.".prq.xml"; open (XML, ">".$mynewxml) or die "can't write to $mynewxml\n"; print XML "\n"; $firstline .= " UA_identifier=\"$myId\" UA_purl=\"$mypurl\" >\n"; print XML $firstline; foreach (@thisfile){ print XML $_;} close(XML); $mynewpdf = $content.$myId.".pdf"; copy($mypdf, $mynewpdf) or warn "could not copy $mypdf to $mynewpdf\n"; $count = 1; while (($kid, $path) = each(%subfiles)){ ($ext = $kid) =~ s/^.*?(\..{2,5})$/\1/; if (! $ext){ warn "could find no extension for subsidiary file $kid. No rename. GO GET IT!\n";} else{ $newnum = sprintf ("%04d", $count); $newkid = $content.$myId."_".$newnum.$ext; # print "\nextension for $kid is $ext -- rename will be $myId$ext. \n"; $old = $path; copy ($path, $newkid) or warn "could not copy $path to $newkid\n"; $count ++; } } } print OUT "\n"; close (OUT); $dbh->disconnect();