#!/usr/bin/perl
use Time::Local;
use DBI;

# acumenCount
# checks through the directories in /srv/www/htdocs/content and does a count
# of collections, item level and page level files
# jody DeRidder updated 4/2/10

## Copyright (c) 2010, The University of Alabama Libraries.
## Contributed by Jody DeRidder, 6/10/10.
## All rights reserved.
## Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
## * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
##   the documentation and/or other materials provided with the distribution.
## * Neither the name of The University of Alabama Libraries nor the names of its contributors may be used to endorse or promote products
##   derived from this software without specific prior written permission.
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
##THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
##CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
##PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
##LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;

# ---------------------------------------------------------------------------
# acumenCount main program
#
# Walks the content tree under $base, derives one collection id per
# <topdir>/<collection> directory, looks up each collection's title in the
# allColls table, and writes a tab-delimited report with per-collection item
# and file counts plus grand totals.
#
# Output: ./output/AcumenCollCount_<UTC timestamp>.txt (the ./output
# directory must already exist).
# ---------------------------------------------------------------------------

# --- configuration ---------------------------------------------------------
my $hostname = "localhost";
my $port     = "3306";
my $user     = "user";
my $password = "password";
my $database = "InfoTrack";
my $base     = "/srv/www/htdocs/content/";    # must end with a slash

# --- database --------------------------------------------------------------
my $dbh = DBI->connect("DBI:mysql:$database:$hostname:$port", $user, $password)
    or die "can't connect to database: ", $DBI::errstr, "\n";

# These were previously assigned to an undefined $h and so never applied.
# With RaiseError on, failed prepare/execute calls die with DBI's own message.
$dbh->{PrintError} = 1;
$dbh->{RaiseError} = 1;

# --- report file -----------------------------------------------------------
my $stamp = timestamp();
my $out   = "./output/AcumenCollCount_" . $stamp . ".txt";
open(my $out_fh, '>', $out) or die "can't open $out: $!\n";
print {$out_fh} "Coll ID\tColl Title\tItem Count\tFile Count\tHas EAD (1=yes)\n";

# --- top-level directories (one lowercase letter followed by four digits) ---
my @topdirs;
opendir(my $base_dh, $base) or die "can't look through $base: $!\n";
while (defined(my $entry = readdir $base_dh)) {
    next if $entry =~ /^\./;
    push @topdirs, $base . $entry if $entry =~ /^[a-z]{1}\d{4}$/;
}
closedir($base_dh);

# --- collection level --------------------------------------------------------
# Each element is [ collection id, collection directory with trailing slash ].
# The real directory is kept next to the id rather than being reconstructed
# from the id later, which went wrong for directory names containing '_'.
my @collections;
for my $topdir (@topdirs) {
    opendir(my $top_dh, $topdir) or die "can't look through $topdir: $!\n";
    while (defined(my $entry = readdir $top_dh)) {
        next if $entry =~ /^\./
             || $entry =~ /Documentation/
             || $entry =~ /Metadata/;
        my $colldir = $topdir . "/" . $entry;
        push @collections, [ collection_id($colldir, $base), $colldir . "/" ];
    }
    closedir($top_dh);
}

my $numcolls = scalar(@collections);
print "we have $numcolls collections represented in Acumen.\n";

# --- per-collection counts ---------------------------------------------------
# One prepared statement with placeholders serves every title lookup.
my $title_sth = $dbh->prepare(
    "select title from allColls where id_2009 = ? and analogOrDigital = ?");

my @eads;       # base names of every *.ead.xml found
my @no_eads;    # directories of collections without any EAD
my $total_items = 0;
my $total_files = 0;

for my $collection (sort { $a->[0] cmp $b->[0] } @collections) {
    my ($coll, $coll_dir) = @{$collection};

    my $title = lookup_title($title_sth, $coll);

    # The EAD flag is computed fresh for every collection. Previously it was
    # only reset when a Metadata directory existed, so a collection without
    # one inherited the previous collection's flag and was never counted as
    # lacking an EAD.
    my @coll_eads = find_eads($coll_dir);
    push @eads, @coll_eads;
    my $has_ead = @coll_eads ? 1 : '';
    push @no_eads, $coll_dir unless $has_ead;

    my $itemcount = count_items($coll_dir, $base);

    # Files are counted by the dedicated traversal only. The item traversal
    # used to add files to the same list, counting those twice.
    my $filecount = count_files($coll_dir);

    $total_items += $itemcount;
    $total_files += $filecount;
    print {$out_fh} "$coll\t$title\t$itemcount\t$filecount\t$has_ead\n";
}

# --- totals ------------------------------------------------------------------
my $num_eads = scalar(@eads);
print {$out_fh} "\n\nTotal items: $total_items\nTotal files: $total_files\nNumber of collections: $numcolls\n";
print {$out_fh} "Number of EADs: $num_eads\n";

my $num_no_eads = scalar(@no_eads);
print "Number of collections with no EAD: $num_no_eads\n\n";
print "\n\nLook in $out to find counts by collection in Acumen. \n Open in Excel with tabs as delimiters.\n\n";

close($out_fh) or die "can't close $out: $!\n";
$dbh->disconnect();

# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------

# Numeric sort comparator. Not used by this script; retained so the name
# stays defined.
sub by_number { $a <=> $b; }

# Builds a UTC timestamp of the form YYYY-MM-DDTHH:MM:SSZ, stores it in the
# package variable $timestamp (as before) and returns it.
sub timestamp {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    our $timestamp = sprintf(
        "%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec
    );
    return $timestamp;
}

# Turns a collection directory path into its collection id: the base prefix
# is removed, slashes become underscores, and runs of underscores collapse
# to a single one.
#   $colldir - path of the collection directory
#   $base    - content root to strip from the front
sub collection_id {
    my ($colldir, $base) = @_;
    (my $id = $colldir) =~ s{^\Q$base\E}{};
    $id =~ tr{/}{_};
    $id =~ s/__+/_/g;
    return $id;
}

# Returns the title stored for a collection id, trying the "D" row first
# and the "A" row second. Returns an empty string when neither has a title.
#   $sth  - prepared statement taking (id_2009, analogOrDigital)
#   $coll - collection id
sub lookup_title {
    my ($sth, $coll) = @_;
    for my $kind ("D", "A") {
        $sth->execute($coll, $kind);
        my ($title) = $sth->fetchrow_array();
        $sth->finish();
        return $title if defined $title && length $title;
    }
    return '';
}

# Returns the base names (the text before ".ead.xml") of the EAD files found
# in every entry of the collection directory whose name matches "Metadata".
# Dies if the collection directory or such an entry cannot be opened.
sub find_eads {
    my ($coll_dir) = @_;
    my @found;
    opendir(my $coll_dh, $coll_dir) or die "can't open $coll_dir: $!\n";
    while (defined(my $entry = readdir $coll_dh)) {
        next if $entry =~ /^\./ || $entry =~ /Documentation/;
        next unless $entry =~ /Metadata/;
        my $md_path = $coll_dir . $entry;
        opendir(my $md_dh, $md_path) or die "can't look in $md_path: $!\n";
        while (defined(my $md_file = readdir $md_dh)) {
            push @found, $1 if $md_file =~ /^(.*?)\.ead\.xml/;
        }
        closedir($md_dh);
    }
    closedir($coll_dh);
    return @found;
}

# True for directory entries that neither traversal looks at: dot entries,
# Metadata, Documentation, the 512/128 pixel derivatives and OCR text.
sub is_skipped_entry {
    my ($entry) = @_;
    return $entry =~ /^\./
        || $entry =~ /Metadata/
        || $entry =~ /Documentation/
        || $entry =~ /\_512\.jpg/
        || $entry =~ /\_128\.jpg/
        || $entry =~ /\.ocr\.txt/;
}

# Counts the items below a collection directory. A sub-directory is an item
# when it contains Metadata/<id>.mods.xml, where <id> is its path relative to
# the base with slashes turned into underscores. Item directories are not
# descended into; other directories are, except paths matching "Transcripts".
#   $coll_dir - collection directory, with trailing slash
#   $base     - content root used to derive the id
sub count_items {
    my ($coll_dir, $base) = @_;
    my $items = 0;
    my @queue = ($coll_dir);    # explicit work list; nothing is modified mid-foreach
    while (@queue) {
        my $dir = shift @queue;
        opendir(my $dh, $dir) or die "can't look through $dir: $!\n";
        while (defined(my $entry = readdir $dh)) {
            next if is_skipped_entry($entry);
            my $path = $dir . $entry;
            next unless -d $path;
            next if $path =~ /Transcripts/;

            (my $id = $path) =~ s{^\Q$base\E}{};
            $id =~ tr{/}{_};
            $id =~ s/_$//;
            $id =~ s/^_//;

            if (-e $path . "/Metadata/" . $id . ".mods.xml") {
                $items++;
                next;    # do NOT go into lower directories
            }
            push @queue, $path . "/";
        }
        closedir($dh);
    }
    return $items;
}

# Counts the deliverable files (.txt, _2048.jpg, .pdf, .mp3) in every
# directory below a collection directory, and reports any other
# non-directory entry on STDOUT.
#   $coll_dir - collection directory, with trailing slash
sub count_files {
    my ($coll_dir) = @_;
    my $files = 0;
    my @queue = ($coll_dir);
    while (@queue) {
        my $dir = shift @queue;
        opendir(my $dh, $dir) or die "can't look through $dir: $!\n";
        while (defined(my $entry = readdir $dh)) {
            next if is_skipped_entry($entry);
            my $path = $dir . $entry;
            if (-d $path) {
                push @queue, $path . "/";
            }
            elsif ($entry =~ /\.txt/
                || $entry =~ /\_2048\.jpg/
                || $entry =~ /\.pdf/
                || $entry =~ /\.mp3/)
            {
                $files++;
            }
            else {
                print "what is this? $path\n";
            }
        }
        closedir($dh);
    }
    return $files;
}