#!/usr/bin/perl
use Time::Local;

# testIncoming
# locate all the bad filenames
# things in the wrong directory
# things missing
# bad sequences
# jody DeRidder, 8/25/09

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 7/30/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
## derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;

# =====================================================================
# Main script.
#
# For every directory directly under $base:
#   1. derive a collection identifier from the directory name, or ask
#      the operator for one on STDIN;
#   2. walk the directory tree below it, collecting
#        @missing   - Admin/Metadata directories lacking the collection's
#                     .xml / .txt file,
#        @wrongdir  - entries whose name does not contain the collection
#                     identifier or the name of their parent directory,
#        @badform   - names that do not match the expected pattern,
#        @badcount  - gaps in the 7-digit, 4-digit and 3-digit numbering;
#   3. append the findings for that directory to the report file.
#
# The report is written to ./output/IncomingTest_<timestamp>.rtf; the
# ./output directory must already exist.
# =====================================================================

our $timestamp;    # package global, filled in by timestamp()
timestamp();
print $timestamp."\n";

my $output = "./output/IncomingTest_".$timestamp.".rtf";
open(my $out, '>', $output) or die "can't open $output: $!\n";

my $base = "/srv/deposits/content/";

# Top-level directories to test, each stored with a trailing slash.
# Sorted because readdir returns entries in no particular order.
opendir(my $base_dh, $base) or die "can't look through $base: $!\n";
my @dirs;
while (defined(my $file = readdir($base_dh))) {
    next if $file =~ /^\./;    # skip dot files
    push @dirs, $file."/" if -d $base.$file;
}
closedir($base_dh);
@dirs = sort @dirs;

foreach my $topdir (@dirs) {
    my $thisdir = $base.$topdir;    # $topdir already ends in "/"

    # Per-directory accumulators; declared here so that nothing leaks from
    # one top-level directory into the next.
    my (@missing, @wrongdir, @badform, @badcount, @itemnums, %itempages);

    # Work queue of directories still to be examined.  First-level entries
    # keep a trailing slash, exactly as before, which makes their $parent
    # (the text after the last slash) empty.
    my @queue;
    opendir(my $top_dh, $thisdir) or die "can't open $thisdir: $!\n";
    while (defined(my $file = readdir($top_dh))) {
        next if $file =~ /^\./;    # skip dot files
        my $path = $thisdir.$file;
        push @queue, $path."/" if -d $path;
    }
    closedir($top_dh);
    @queue = sort @queue;

    # See whether the directory name carries a collection identifier; the
    # operator confirms it or types a replacement.
    my $suggested;
    if ($thisdir =~ /([a-z]{1}\d{4}\_\d{7}(\_\d{7})?)/) {
        $suggested = $1;
    }
    my $coll = _ask_collection_id($suggested, $thisdir);

    print "\nWe will test the files against collection identifier $coll.\n\n";
    print "Results will be in the $output file....\n when you see a \"Good bye!\" and this screen closes.\n";
    print "Don't open it yet! :-) \n\n";
    print ". . . working . . . (be patient) . . . \n\n";

    # Breadth-first walk.  A queue is used instead of pushing onto the array
    # a foreach is iterating over, which perlsyn warns against.
    while (@queue) {
        my $dir = shift @queue;

        # Path relative to $base, used in report messages.
        (my $short = $dir) =~ s{\A\Q$base\E}{};
        # Parent directory name is whatever follows the last slash.
        (my $parent = $dir) =~ s{.*/}{};

        my $count = 1;    # next expected 3-digit sequence number in $dir

        # Sorted so the sequence check does not depend on readdir order.
        opendir(my $dh, $dir) or die "can't read files in $dir: $!\n";
        my @entries = sort grep { !/^\./ && !/Thumbs\.db/i } readdir($dh);
        closedir($dh);

        foreach my $adir (@entries) {
            my $thispath = ($dir =~ m{/\z}) ? $dir.$adir : $dir."/".$adir;

            if ($adir =~ /Scans/i || $adir =~ /Transcripts/i) {
                push @queue, $thispath;
                next;
            }
            if ($adir =~ /Admin/) {
                push @missing, "$coll.xml missing from $parent\n"
                    unless _has_collection_file($thispath, $coll, 'xml');
                next;
            }
            if ($adir =~ /Metadata/) {
                push @missing, "$coll.txt missing from $parent\n"
                    unless _has_collection_file($thispath, $coll, 'txt');
                next;
            }
            if ($adir !~ /\Q$coll\E/) {
                # name does not even contain the collection identifier
                push @wrongdir, $adir." ".$short;
                next;
            }

            # Anything under Metadata, Transcripts or Admin is exempt from
            # the numbering and directory-name-format checks.
            my $exempt = $thispath =~ /Metadata/
                      || $thispath =~ /Transcripts/
                      || $thispath =~ /Admin/;

            # Numbering checks; .txt files are never counted.
            if (!$exempt && $adir !~ /\.txt/) {
                if ($adir =~ /.*\_(\d{7})(\..{3})?$/) {
                    push @itemnums, $1 + 0;    # + 0 drops leading zeros
                }
                elsif ($adir =~ /.*\_(\d{4})(\..{3})?$/) {
                    push @{ $itempages{$parent} }, $1 + 0;
                }
                elsif ($adir =~ /.*\_(\d{3})(\..{3})?$/) {
                    my $mynum = $1 + 0;
                    if ($mynum != $count) {
                        my $diff = $mynum - $count;
                        if ($diff < 0) {
                            print "Error: diff: $diff; mynum $mynum; count $count; $thispath $adir\n";
                        }
                        push @badcount, _gap_message("$short $adir", $count, $mynum);
                    }
                    $count = $mynum + 1;
                }
            }

            if (-d $thispath) {
                if (!$exempt
                    && $adir !~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?$/) {
                    push @badform, $adir;
                }
                push @queue, $thispath;    # look inside it later
            }
            else {
                # A file: does its name contain its parent directory's name?
                # Skipped for Scans/Admin parents and when there is no parent
                # name (first-level directories), where an empty regex would
                # otherwise be used.
                # NOTE(review): confirm that first-level directories are meant
                # to be exempt from this check.
                if (   $parent ne ''
                    && $parent !~ /Scans/i
                    && $parent !~ /Admin/
                    && index($adir, $parent) < 0) {
                    push @wrongdir, $adir." ".$short;
                }
                if ($adir !~ /^[a-z]{1}\d{4}\_\d{7}\_\d{7}(\_\d{4}(\_\d{3})?)?(\.\d{1})?\.\w{3}$/) {
                    push @badform, $adir;
                }
            }
        }
    }

    # Item level: the 7-digit numbers should run 1, 2, 3, ... without gaps.
    my $count = 1;
    foreach my $mynum (sort by_number @itemnums) {
        if ($mynum != $count) {
            if ($mynum - $count < 0) {
                print "error: $mynum item, expected count is $count\n";
            }
            push @badcount, _gap_message("item level", $count, $mynum);
        }
        $count = $mynum + 1;
    }

    # Page level: the same test for the 4-digit numbers, per parent directory.
    # Keys are directory names, so they are sorted as strings.
    foreach my $item (sort keys %itempages) {
        $count = 1;
        foreach my $mynum (sort by_number @{ $itempages{$item} }) {
            if (!$mynum) {
                print "no page number for something in $item\n";
            }
            if ($mynum != $count) {
                if ($mynum - $count < 0) {
                    print "error: $mynum page in $item item, expected count is $count\n";
                }
                push @badcount, _gap_message("pages of $item", $count, $mynum);
            }
            $count = $mynum + 1;
        }
    }

    # Report for this top-level directory.
    if (@badform || @badcount || @wrongdir || @missing) {
        print {$out} "\nTROUBLE: $topdir \n";
        print {$out} "-------------------------------------------\n";
    }
    else {
        print {$out} "All is GREAT in $topdir! :-) \n";
    }
    _print_section($out, \@missing,
        "\nThe following files are missing or badly named \n");
    _print_section($out, \@wrongdir,
        "\nThe following files or directories do NOT reflect the name of their parent directory\n",
        "Are they in the right place? Please check:\n");
    _print_section($out, \@badform,
        "\nThe following filenames or directories are not in the correct format:\n");
    _print_section($out, \@badcount,
        "\nSome files seem to be missing, according to sequence gaps\n");
}    # do the next top directory

close($out) or die "can't finish writing $output: $!\n";
print "Good bye!\n";
exit;

# Set the global $timestamp to the current UTC time, formatted as
# YYYY-MM-DDTHH:MM:SSZ, and return it.
# The result is used in the report file name; it contains ':' characters,
# which Windows file names do not allow, so a Windows port needs a
# different format.
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    $timestamp = sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
    return $timestamp;
}

# Sort comparator: ascending numeric order.
sub by_number { $a <=> $b; }

# Read one line from STDIN; returns '' at end of input so that callers
# can apply a regex without an undef warning.
sub _read_answer {
    my $line = <STDIN>;
    return defined $line ? $line : '';
}

# Ask the operator to confirm $candidate (found in directory $where) as the
# collection identifier, or to type one in.  Any answer containing "n" or
# "N" counts as a rejection and triggers another round.  Returns the
# confirmed, newline-free, non-empty identifier; dies if input ends before
# one has been supplied.
sub _ask_collection_id {
    my ($candidate, $where) = @_;

    if (defined $candidate && length $candidate) {
        print "Looking at $where: Is this the collection identifier? -->$candidate\n";
        print "Y or N, and press enter\n";
        return $candidate unless _read_answer() =~ /n/i;
    }

    while (1) {
        print "Please enter the collection identifier\n";
        my $typed = <STDIN>;
        die "no collection identifier supplied (end of input)\n"
            unless defined $typed;
        chomp $typed;    # strip the newline from the identifier itself
        print "Is this the collection identifier? -->$typed\n";
        print "Y or N, and press enter\n";
        my $answer = _read_answer();
        return $typed if length $typed && $answer !~ /n/i;
    }
}

# True if directory $path contains "<coll>.<ext>" or "<coll>.<N>.<ext>",
# where N is one or two digits.  Dies if $path cannot be opened as a
# directory.
sub _has_collection_file {
    my ($path, $coll, $ext) = @_;

    opendir(my $dh, $path) or die "can't look through $path: $!\n";
    my $found = 0;
    while (defined(my $entry = readdir($dh))) {
        next if $entry =~ /^\./;
        if ($entry =~ /^\Q$coll\E(?:\.\d{1,2})?\.\Q$ext\E$/) {
            $found = 1;
            last;
        }
    }
    closedir($dh);
    return $found;
}

# Build the report line for a break in a numeric sequence: $expected is the
# number that should have come next, $found the number actually seen.
sub _gap_message {
    my ($label, $expected, $found) = @_;

    my $lastmissing = $found - 1;
    return "$label --> $expected is missing" if $lastmissing == $expected;

    my $diff = $found - $expected;
    return "$label --> $diff missing files: $expected through $lastmissing";
}

# Print the heading line(s) followed by one line per item to handle $fh;
# prints nothing when the list is empty.
sub _print_section {
    my ($fh, $items, @heading) = @_;

    return unless @$items;
    print {$fh} @heading;
    print {$fh} " $_\n" foreach @$items;
    return;
}