Scrubsitemap Updated

Ok, this one is boring, with lots of repeats (sorry). Not only were the threads changed with a perl update (see thread-boss-worker-updated), but the changes were enough to screw up one of my 'threads->create' calls. Oh well, it's been done, and here's what's left:

#!/usr/bin/perl
#
# scrubsitemap.pl
#
# Copyright 2007-2008 Joshua Radke (josh at radkeland dot org)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is the direct result of the blog entry at
# http://www.radkeland.org/project-treadmill-continues-cleaning-your-sitem... Here's the general outline.
#
# Read the sitemap generation configuration file (provided from a global variable).
# Read the sitemap file.
# Loop through the sitemap file.  For each URL, make certain the
# server responds.
# Read the access logs, extracting URL's.
# Now, loop though the URL's in the access logs.  For each:
#    If it matches an exclude rule from the sitemap config, move on.
#    (Increment counter on sitemap config)
# Finally, loop through the URL's from the sitemap config.  For each
# one that has not been incremented, print out a message to delete the
# 'stale' rule.
use strict;
use threads;
use Thread::Semaphore;
use IO::All;

# For threading
my $maxthreads = 4;    # Careful with this, you can swamp a server ...
my $sem       = new Thread::Semaphore($maxthreads);
my $sitemapfn = '/var/www/radkeland.org/sitemap.xml.gz';
my $smconfig  = '/home/josh/bin/sitemap_gen-1.4/radkeland.org_config.xml';
my @loglist   = qw ( /var/log/httpd/access_log.4 /var/log/httpd/access_log.3
  /var/log/httpd/access_log.2 /var/log/httpd/access_log.1
  /var/log/httpd/access_log);

# Some patterns we'll want to leave in the sitemap config file.  To
# keep them from showing up in our 'abandoned' report, add them here.
my @keeppats = qw < /\.[^/]* /\?q=admin org/cron.php org/favicon.ico
  /icons/ /images/ /\?q=user/.*/edit
  /\?q=user/ /ie\.css /index\.php\?page=http://
  /rldefault\.css /rss.xml /navbar\.php
  /(node|user|home|taxonomy/term)$ /filter/tips$ >;
my @smurls;
open( SITEMAP, "zcat $sitemapfn |" )
  or die "Unable to open sitemap: $sitemapfn";

while (<SITEMAP>) {
my $line = $_;
if ( $line =~ m/^\s+<loc>(.*?)<\/loc>/ ) {
  my $url = $1;
  chomp $url;
  push( @smurls, $url );
}
}
close(SITEMAP);

# There's our sitemap, let's check it with wget.  We'll print out the
# lines that need to go into the sitemap exception file.
my $errcount = 0;
for my $url (@smurls) {
threads->create( \&checkurl, $url ) or
   die "Unable to create thread to check url $url";
cleanthreads( 0, \&printifbad );
}

# Get the rest of our threads:
cleanthreads( 1, \&printifbad );
unless ( $errcount == 0 ) {
print "--------------------- End Errors ---------------------\n";
print "$errcount urls invalid\n";
} else {
print "No bad URLs in sitemap!\n";
}

# In part 2, we look through the log files and determine which patterns
# are no longer being used.  First, let's get our patterns.
my @patternlist;
my @unusedlist;
my @filecontents;
@filecontents = io($smconfig)->chomp->slurp;
for my $line (@filecontents) {
if (
  $line =~ m/<filter\s+action="drop"\s+type="regexp"\s+
                 pattern="([^\"]+)"\s*\/>/x
   )
{
  my $thispat = $1;

  # I've used org at the beginning of my patterns (to distinguish
  # between websites, but the access logs omit the initial part ... strip
  # if necessary.
  $thispat =~ s/^org(\/.*)$/$1/;
  for my $kpat (@keeppats) {
   if ( $kpat eq $thispat ) { $thispat = undef; last }
  }
  if ( defined($thispat) ) { push( @patternlist, $thispat ); }
}
}

# We have our patterns, now let's do the rest of the checking.  We'll
# look through each of the log files.  If a pattern doesn't match
# somewhere in the whole file, we'll move it to the unused list.
# After checking a file, we copy the unused list back to the pattern
# list, and repeat.  After checking all files, anything left will be
# truly unused.
print "Pattern list starts with " . ( $#patternlist + 1 ) . " entries\n";
for my $fn (@loglist) {
@unusedlist = ();
my $fullfile = io($fn)->chomp->slurp;

# Now loop through patterns
for my $pat (@patternlist) {
  if ( $fullfile !~ m/$pat/m ) {
   push( @unusedlist, $pat );
  }
}
@patternlist = @unusedlist;
}
print "---- Unused patterns (can be removed from $smconfig ----\n";
@unusedlist = ();
print join( "\n", @patternlist ) . "\n";
exit(0);

# Taken from threadbossworker.pl, see
# http://www.radkeland.org/boss-worker-threaded-model-implemented-perl
#
for details.
sub cleanthreads {
my $mode      = shift @_;
my $func      = shift @_;
my @othervals = ( defined( $_[0] ) ) ? (@_) : (undef);
my @thrlist;

  @thrlist = threads->list();

# And do the deed - oddly, the parent doesn't seem to get included in
# @thrlist when generated as above.  Keep an eye on this.
foreach my $thr (@thrlist) {
  &$func( $thr->join(), @othervals );
}
return;
}

# This is a threaded function to check a url.
sub checkurl {
$sem->down;
my $url = shift;

# Grrr ... the injection attacks use bizzare characters ... we'll need
# to quote them for wget to work properly.
my $thisurl  = quotemeta($url);
my $response = `wget --spider $thisurl 2>&1`;
my $retval;
if ( $response =~ /ERROR/ ) {
  $retval =
    "  <filter  action=\"drop\" type=\"regexp\" pattern=\"$thisurl\" />\n";
} else {
  $retval = undef;
}
$sem->up;
return $retval;
}

# This routine prints the bad line (if there is one), otherwise, it
# does nothing.  It is also responsible for printing the header and
# updating the global $errcount.
sub printifbad {
my $out = shift;
unless ( defined($out) ) { return }
print "-------------------- Begin Errors --------------------\n"
   unless $errcount;
$errcount++;
print $out;
return;
}