#!/usr/bin/perl
use Time::Local;

# testDeposits
# a version of testIncoming for DS staff to use on deposits directory
# locate all the bad filenames
# things in the wrong directory
# things missing
# bad sequences
# Jody DeRidder, 8/25/09

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 6/10/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;

# ---------------------------------------------------------------------------
# Main script.
#
# For every collection directory directly under $base this script:
#   1. asks the operator to confirm (or type) the collection identifier,
#   2. walks the collection's subdirectories, recording badly formed names,
#      files that do not carry their parent directory's name, a missing
#      collection XML (Admin) or spreadsheet (Metadata), and Box/Folder style
#      content that needs a different checker,
#   3. looks for gaps in the item / page / subpage number sequences,
#   4. appends the findings for that collection to the report file.
#
# The script is interactive: it reads answers from STDIN and prints prompts
# to STDOUT.  The report goes to ./output/IncomingTest_<timestamp>.txt.
# ---------------------------------------------------------------------------

# Root of the deposits tree; each immediate subdirectory is checked in turn.
my $base = "/srv/deposits/content/";

# Same root without trailing slashes, so paths can be joined with one "/".
(my $root = $base) =~ s{/+\z}{};

# timestamp() stores its result here (package variable, as it always has).
our $timestamp;
timestamp();

my $output = "./output/IncomingTest_" . $timestamp . ".txt";
open(my $out, '>', $output) or die "can't open $output: $!\n";

my @dirs = list_subdirs($root);

print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n";
print "Don't open it yet! :-) \n\n";

foreach my $topdir (@dirs) {
    my $top_path = "$root/$topdir";

    # The identifier is established afresh for every top-level directory so
    # that one collection's identifier is never reused for the next.
    my $coll = confirm_collection_id($top_path);

    print "\nWe will test the files against collection identifier $coll.\n\n";
    print ". . . working . . . (be patient) . . . \n\n";

    my $found = check_collection($root, $top_path, $coll);
    write_report($out, "$topdir/", $found);
}    # do the next top directory

# Buffered write errors only surface at close, so check it.
close($out) or die "can't finish writing $output: $!\n";
print "Good bye!\n";
exit;

# ---------------------------------------------------------------------------
# list_subdirs($dir)
# Returns the sorted names (not paths) of the non-dot subdirectories of $dir.
# Dies if $dir cannot be opened.
# ---------------------------------------------------------------------------
sub list_subdirs {
    my ($dir) = @_;

    opendir(my $dh, $dir) or die "can't look through $dir: $!\n";
    my @names = grep { !/^\./ && -d "$dir/$_" } readdir($dh);
    closedir($dh);

    my @sorted = sort @names;
    return @sorted;
}

# ---------------------------------------------------------------------------
# read_line()
# Reads one line from STDIN with its line ending removed.
# Returns undef when STDIN is exhausted.
# ---------------------------------------------------------------------------
sub read_line {
    my $line = <STDIN>;
    return unless defined $line;
    $line =~ s/\r?\n\z//;
    return $line;
}

# ---------------------------------------------------------------------------
# answered_no()
# Reads one answer from STDIN; returns 1 if it contains an "n" (any case),
# 0 otherwise.  End of input counts as "not no".
# ---------------------------------------------------------------------------
sub answered_no {
    my $answer = read_line();
    return (defined($answer) && $answer =~ /n/i) ? 1 : 0;
}

# ---------------------------------------------------------------------------
# confirm_collection_id($dir)
# If $dir contains something shaped like a collection identifier
# (letter, 4 digits, "_", 7 digits, optionally "_" and 7 more digits) the
# operator is asked to confirm it.  Otherwise, or if the operator says no,
# the operator is asked to type the identifier until one is confirmed.
# Returns the confirmed identifier.  Dies if STDIN ends before an identifier
# has been supplied.
# ---------------------------------------------------------------------------
sub confirm_collection_id {
    my ($dir) = @_;

    if ($dir =~ /([a-z]\d{4}_\d{7}(?:_\d{7})?)/) {
        my $guess = $1;
        print "Looking at $dir: Is this the collection identifier? -->$guess\n";
        print "Y or N, and press enter\n";
        return $guess unless answered_no();
    }

    while (1) {
        print "Please enter the collection identifier\n";
        my $coll = read_line();
        die "no input available for the collection identifier of $dir\n"
            unless defined $coll;

        $coll =~ s/^\s+|\s+$//g;
        next if $coll eq '';    # an empty identifier would match every name

        print "Is this the collection identifier? -->$coll\n";
        print "Y or N, and press enter\n";
        return $coll unless answered_no();
    }
}

# ---------------------------------------------------------------------------
# parent_label($short, $coll_re)
# Works out the name that files inside the directory $short are expected to
# contain.  If, after the collection identifier, the path holds something
# shaped like an item identifier, the label is the last such identifier plus
# whatever follows it (with "/" turned into "_").  Otherwise the label is the
# last component of the path.
# ---------------------------------------------------------------------------
sub parent_label {
    my ($short, $coll_re) = @_;

    (my $flat = $short) =~ s{/}{_}g;
    if ($flat =~ /$coll_re.*([a-z]\d{4}_\d{7}_\d{7}.*)/) {
        return $1;
    }

    (my $last = $short) =~ s{.*/}{};
    return $last;
}

# ---------------------------------------------------------------------------
# dir_has_collection_file($dir, $coll, $ext)
# Returns 1 if $dir contains "<coll>.<ext>" or "<coll>.<1-2 digits>.<ext>",
# 0 otherwise.  Dies if $dir cannot be opened.
# ---------------------------------------------------------------------------
sub dir_has_collection_file {
    my ($dir, $coll, $ext) = @_;

    opendir(my $dh, $dir) or die "can't look through $dir: $!\n";
    my @names = readdir($dh);
    closedir($dh);

    foreach my $name (@names) {
        next if $name =~ /^\./;
        return 1 if $name =~ /^\Q$coll\E(?:\.\d{1,2})?\.\Q$ext\E$/;
    }
    return 0;
}

# ---------------------------------------------------------------------------
# find_gaps($gap_label, $error_label, @numbers)
# Sorts @numbers and compares them with the sequence 1, 2, 3, ...
# Returns one message per gap, prefixed with $gap_label.  A number lower than
# the one expected (a repeat, or zero) is reported on STDOUT using
# $error_label and does not produce a gap message.
# ---------------------------------------------------------------------------
sub find_gaps {
    my ($gap_label, $error_label, @numbers) = @_;

    my @gaps;
    my $expected = 1;
    foreach my $num (sort by_number @numbers) {
        if ($num < $expected) {
            print "error: $num $error_label, expected count is $expected\n";
            next;
        }
        if ($num > $expected) {
            my $missing      = $num - $expected;
            my $last_missing = $num - 1;
            push @gaps,
                $last_missing != $expected
                ? "$gap_label --> $missing missing files: $expected through $last_missing"
                : "$gap_label --> $expected is missing";
        }
        $expected = $num + 1;
    }
    return @gaps;
}

# ---------------------------------------------------------------------------
# check_collection($root, $top_path, $coll)
# Walks every directory below $top_path and returns a hash reference with the
# keys mass, missing, wrongdir, badform and badcount, each an array reference
# of report lines.  $root is stripped from paths shown in the report.
# May prompt on STDIN (when the Admin folder lacks the collection XML).
# Dies if a directory cannot be read.
# ---------------------------------------------------------------------------
sub check_collection {
    my ($root, $top_path, $coll) = @_;

    my %found = map { $_ => [] } qw(mass missing wrongdir badform badcount);
    my (@itemnums, %itempages, %itemsubpages);

    # The identifier may have been typed by the operator: match it literally.
    my $coll_re = qr/\Q$coll\E/;

    # Item identifier, optionally followed by a page and a subpage number.
    my $item_id = qr/[a-z]\d{4}_\d{7}_\d{7}(?:_\d{4}(?:_\d{3})?)?/;

    # Directories still to be examined.  New ones are appended while walking,
    # so this is an explicit queue rather than a foreach over a growing array.
    my @queue = map { "$top_path/$_" } list_subdirs($top_path);

    while (defined(my $dir = shift @queue)) {
        opendir(my $dh, $dir) or die "can't read files in $dir: $!\n";
        my @entries = grep { !/^\./ && !/Thumbs\.db/i } readdir($dh);
        closedir($dh);
        @entries = sort @entries;

        (my $short = $dir) =~ s{^\Q$root\E/}{};
        my $parent = parent_label($short, $coll_re);

      ENTRY:
        foreach my $entry (@entries) {
            my $path = "$dir/$entry";

            # Anything under these folders is left out of numbering checks.
            my $in_support_dir =
                ($path =~ /Metadata/ || $path =~ /Transcripts/ || $path =~ /Admin/);

            # Entries here are not required to carry the parent's name.
            my $skip_parent_check =
                ($parent =~ /Scans/i || $path =~ /Admin/ || $path =~ /Metadata/);

            if ($entry =~ /Scans/i || $entry =~ /Transcripts/i) {
                push @queue, $path;
                next ENTRY;
            }
            elsif ($entry =~ /Admin/) {
                if (!dir_has_collection_file($path, $coll, 'xml')) {
                    print "\n\nIs this collection already online? Y or N\n";
                    if (answered_no()) {
                        push @{ $found{missing} }, "$coll.xml missing from Admin\n";
                    }
                }
            }
            elsif ($entry =~ /Metadata/) {
                if (!dir_has_collection_file($path, $coll, 'txt')) {
                    push @{ $found{missing} },
                        "$coll.txt (collection spreadsheet) missing from Metadata folder\n";
                }
            }
            elsif ($entry =~ $coll_re) {
                my $is_txt = ($entry =~ /\.txt/) ? 1 : 0;

                # Item level: name ends in a 7-digit number ("+ 0" drops
                # the leading zeros).
                if (!$is_txt && $entry =~ /.*_(\d{7})(?:\..{3})?$/) {
                    my $num = $1 + 0;
                    push @itemnums, $num unless $in_support_dir;
                }
                # Page level: 4-digit number, filed under the parent label.
                elsif (!$is_txt && $entry =~ /.*_(\d{4})\..{3}$/) {
                    my $num = $1 + 0;
                    push @{ $itempages{$parent} }, $num unless $in_support_dir;
                }
                # Subpage level: 3-digit number.
                elsif (!$is_txt && $entry =~ /.*_(\d{3})\..{3}$/) {
                    my $num = $1 + 0;
                    push @{ $itemsubpages{$parent} }, $num unless $in_support_dir;
                }
                # Box/Folder layout needs a different script.
                elsif ($entry =~ /Box/i || $entry =~ /Folder/i) {
                    push @{ $found{mass} }, $parent;
                    next ENTRY;
                }

                if (-d $path) {
                    if ($entry !~ /^$item_id$/ && !$in_support_dir) {
                        push @{ $found{badform} }, $entry;
                    }
                    push @queue, $path;    # examine the subdirectory later
                }
                else {
                    # A file: does its name contain its parent's label?
                    # index() is used instead of /$parent/ because an empty
                    # pattern makes Perl reuse the last successful regex.
                    if (!$skip_parent_check && index($entry, $parent) < 0) {
                        push @{ $found{wrongdir} }, $entry . " " . $short;
                    }

                    my $well_formed =
                        (      $entry =~ /^$item_id(?:\.\d)?\.\w{3}$/
                            || $entry =~ /^[a-z]\d{4}_\d{7}\.\w{3}$/
                            || $entry =~ /^$item_id(?:\.\d)?\.adl\.xml$/);
                    push @{ $found{badform} }, $entry unless $well_formed;
                }
            }
            elsif (!$skip_parent_check) {
                # Does not even mention the collection identifier.
                push @{ $found{wrongdir} }, $entry . " " . $short;
            }
        }
    }

    # Sequence checks: items, then pages per item, then subpages per page.
    push @{ $found{badcount} }, find_gaps("item level", "item", @itemnums);

    foreach my $item (sort keys %itempages) {
        my @pages = @{ $itempages{$item} };
        foreach my $zero (grep { !$_ } @pages) {
            print "no page number for something in $item\n";
        }
        push @{ $found{badcount} },
            find_gaps("pages of $item", "page in $item item", @pages);
    }

    foreach my $page (sort keys %itemsubpages) {
        my @subpages = @{ $itemsubpages{$page} };
        foreach my $zero (grep { !$_ } @subpages) {
            print "no subpage number for something in $page\n";
        }
        push @{ $found{badcount} },
            find_gaps("subpages of $page", "subpage in $page", @subpages);
    }

    return \%found;
}

# ---------------------------------------------------------------------------
# write_report($out, $topdir, $found)
# Writes the section of the report for one top-level directory to the
# filehandle $out.  $found is the hash reference from check_collection().
# ---------------------------------------------------------------------------
sub write_report {
    my ($out, $topdir, $found) = @_;

    # Report sections, in the order they are written.
    my @sections = (
        [ 'mass',     "\n This collection needs to be tested with CheckBoxFolder instead!\n" ],
        [ 'missing',  "\nThe following files are missing or badly named \n" ],
        [ 'wrongdir', "\nThe following files or directories do NOT reflect the name of their parent directory\n"
                    . "Are they in the right place? Please check:\n" ],
        [ 'badform',  "\nThe following filenames or directories are not in the correct format:\n" ],
        [ 'badcount', "\nSome files seem to be missing, according to sequence gaps\n" ],
    );

    my $trouble = grep { scalar @{ $found->{ $_->[0] } } } @sections;
    if ($trouble) {
        print {$out} "\nTROUBLE: $topdir \n";
        print {$out} "-------------------------------------------\n";
    }
    else {
        print {$out} "All is GREAT in $topdir! :-) \n";
    }

    foreach my $section (@sections) {
        my ($key, $heading) = @{$section};
        my @lines = @{ $found->{$key} };
        next unless @lines;

        print {$out} $heading;
        foreach my $line (@lines) {
            print {$out} " $line\n";
        }
    }
    return;
}

# ---------------------------------------------------------------------------
# timestamp()
# Builds a UTC timestamp of the form YYYY-MM-DDThhmmssZ from gmtime(), stores
# it in the package variable $timestamp and returns it.
# (An earlier, commented-out Windows variant shelled out to `date`/`time`.)
# ---------------------------------------------------------------------------
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();

    # gmtime() counts months from 0 and years from 1900.
    $timestamp = sprintf('%04d-%02d-%02dT%02d%02d%02dZ',
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
    return $timestamp;
}

# Sort comparator: ascending numeric order.
sub by_number { return $a <=> $b; }