Ok, this one is boring, with lots of repeats (sorry). Not only were the threads changed with a perl update (see thread-boss-worker-updated), but the changes were enough to screw up one of my 'threads->create' calls. Oh well, it's been done, and here's what's left:
#!/usr/bin/perl
#
# scrubsitemap.pl
#
# Copyright 2007-2008 Joshua Radke (josh at radkeland dot org)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is the direct result of the blog entry at
# http://www.radkeland.org/project-treadmill-continues-cleaning-your-sitem... Here's the general outline.
#
# Read the sitemap generation configuration file (provided from a global variable).
# Read the sitemap file.
# Loop through the sitemap file. For each URL, make certain the
# server responds.
# Read the access logs, extracting URL's.
# Now, loop though the URL's in the access logs. For each:
# If it matches an exclude rule from the sitemap config, move on.
# (Increment counter on sitemap config)
# Finally, loop through the URL's from the sitemap config. For each
# one that has not been incremented, print out a message to delete the
# 'stale' rule.
use strict;
use threads;
use Thread::Semaphore;
use IO::All;
# For threading
my $maxthreads = 4; # Careful with this, you can swamp a server ...
my $sem = new Thread::Semaphore($maxthreads);
my $sitemapfn = '/var/www/radkeland.org/sitemap.xml.gz';
my $smconfig = '/home/josh/bin/sitemap_gen-1.4/radkeland.org_config.xml';
my @loglist = qw ( /var/log/httpd/access_log.4 /var/log/httpd/access_log.3
/var/log/httpd/access_log.2 /var/log/httpd/access_log.1
/var/log/httpd/access_log);
# Some patterns we'll want to leave in the sitemap config file. To
# keep them from showing up in our 'abandoned' report, add them here.
my @keeppats = qw < /\.[^/]* /\?q=admin org/cron.php org/favicon.ico
/icons/ /images/ /\?q=user/.*/edit
/\?q=user/ /ie\.css /index\.php\?page=http://
/rldefault\.css /rss.xml /navbar\.php
/(node|user|home|taxonomy/term)$ /filter/tips$ >;
my @smurls;
open( SITEMAP, "zcat $sitemapfn |" )
or die "Unable to open sitemap: $sitemapfn";
while (<SITEMAP>) {
my $line = $_;
if ( $line =~ m/^\s+<loc>(.*?)<\/loc>/ ) {
my $url = $1;
chomp $url;
push( @smurls, $url );
}
}
close(SITEMAP);
# There's our sitemap, let's check it with wget. We'll print out the
# lines that need to go into the sitemap exception file.
my $errcount = 0;
for my $url (@smurls) {
threads->create( \&checkurl, $url ) or
die "Unable to create thread to check url $url";
cleanthreads( 0, \&printifbad );
}
# Get the rest of our threads:
cleanthreads( 1, \&printifbad );
unless ( $errcount == 0 ) {
print "--------------------- End Errors ---------------------\n";
print "$errcount urls invalid\n";
} else {
print "No bad URLs in sitemap!\n";
}
# In part 2, we look through the log files and determine which patterns
# are no longer being used. First, let's get our patterns.
my @patternlist;
my @unusedlist;
my @filecontents;
@filecontents = io($smconfig)->chomp->slurp;
for my $line (@filecontents) {
if (
$line =~ m/<filter\s+action="drop"\s+type="regexp"\s+
pattern="([^\"]+)"\s*\/>/x
)
{
my $thispat = $1;
# I've used org at the beginning of my patterns (to distinguish
# between websites, but the access logs omit the initial part ... strip
# if necessary.
$thispat =~ s/^org(\/.*)$/$1/;
for my $kpat (@keeppats) {
if ( $kpat eq $thispat ) { $thispat = undef; last }
}
if ( defined($thispat) ) { push( @patternlist, $thispat ); }
}
}
# We have our patterns, now let's do the rest of the checking. We'll
# look through each of the log files. If a pattern doesn't match
# somewhere in the whole file, we'll move it to the unused list.
# After checking a file, we copy the unused list back to the pattern
# list, and repeat. After checking all files, anything left will be
# truly unused.
print "Pattern list starts with " . ( $#patternlist + 1 ) . " entries\n";
for my $fn (@loglist) {
@unusedlist = ();
my $fullfile = io($fn)->chomp->slurp;
# Now loop through patterns
for my $pat (@patternlist) {
if ( $fullfile !~ m/$pat/m ) {
push( @unusedlist, $pat );
}
}
@patternlist = @unusedlist;
}
print "---- Unused patterns (can be removed from $smconfig ----\n";
@unusedlist = ();
print join( "\n", @patternlist ) . "\n";
exit(0);
# Taken from threadbossworker.pl, see
# http://www.radkeland.org/boss-worker-threaded-model-implemented-perl
# for details.
sub cleanthreads {
my $mode = shift @_;
my $func = shift @_;
my @othervals = ( defined( $_[0] ) ) ? (@_) : (undef);
my @thrlist;
@thrlist = threads->list();
# And do the deed - oddly, the parent doesn't seem to get included in
# @thrlist when generated as above. Keep an eye on this.
foreach my $thr (@thrlist) {
&$func( $thr->join(), @othervals );
}
return;
}
# This is a threaded function to check a url.
sub checkurl {
$sem->down;
my $url = shift;
# Grrr ... the injection attacks use bizzare characters ... we'll need
# to quote them for wget to work properly.
my $thisurl = quotemeta($url);
my $response = `wget --spider $thisurl 2>&1`;
my $retval;
if ( $response =~ /ERROR/ ) {
$retval =
" <filter action=\"drop\" type=\"regexp\" pattern=\"$thisurl\" />\n";
} else {
$retval = undef;
}
$sem->up;
return $retval;
}
# This routine prints the bad line (if there is one), otherwise, it
# does nothing. It is also responsible for printing the header and
# updating the global $errcount.
sub printifbad {
my $out = shift;
unless ( defined($out) ) { return }
print "-------------------- Begin Errors --------------------\n"
unless $errcount;
$errcount++;
print $out;
return;
}