Cleaning up your Sitemap - Done

Well, after a long and wonderful family weekend (Easter), and about 20 minutes this morning to tie up loose ends, the mini-project I wrote about a few days ago is finished. All of the configuration (there's not much) is in beginning, and stored in global variables, so it should be fairly easy to adapt it to your own usage. There's nothing like looking at the real thing, so here it is: I hope some can find this useful!

Before the code, I wanted to comment on editor of choice for the (simple) work I've been doing, and what I should be using moving forward. I've used vi and emacs for years, but I keep hearing about eclipse (over and over and over again!). I never given a serious effort towards using it, but for the sake of not falling into obsolescense, I'll give it a go.

#!/usr/bin/perl
#
# scrubsitemap.pl
#
# Copyright 2007 Joshua Radke (josh at radkeland dot org)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (version 2) as
# published by the Free Software Foundation.
#
# This program is the direct result of the blog entry at
# http://www.radkeland.org/project-treadmill-continues-cleaning-your-sitem... Here's the general outline.
#
# Read the sitemap generation configuration file (provided from a global variable).
# Read the sitemap file.
# Loop through the sitemap file.  For each URL, make certain the
# server responds.
# Read the access logs, extracting URL's.
# Now, loop though the URL's in the access logs.  For each:
#    If it matches an exclude rule from the sitemap config, move on.
#    (Increment counter on sitemap config)
# Finally, loop through the URL's from the sitemap config.  For each
# one that has not been incremented, print out a message to delete the
# 'stale' rule.

use strict;
use threads;
use Thread::Semaphore;
use IO::All;

sub cleanthreads($$@);
sub printifbad($);
sub checkurl($);

# For threading
my $maxthreads = 4; # Careful with this, you can swamp a server ...
my $sem = new Thread::Semaphore($maxthreads);

my $sitemapfn = '/var/www/radkeland.org/sitemap.xml.gz';
my $smconfig = '/home/josh/bin/sitemap_gen-1.4/radkeland.org_config.xml';
my @loglist = qw ( /var/log/httpd/access_log.4 /var/log/httpd/access_log.3
                   /var/log/httpd/access_log.2 /var/log/httpd/access_log.1
                   /var/log/httpd/access_log);
# Some patterns we'll want to leave in the sitemap config file.  To
# keep them from showing up in our 'abandoned' report, add them here.
my @keeppats = qw < /\.[^/]* /\?q=admin org/cron.php org/favicon.ico
                    /icons/ /images/ /\?q=user/.*/edit
                    /\?q=user/ /ie\.css /index\.php\?page=http://
                    /rldefault\.css /rss.xml /navbar\.php
                    /(node|user|home|taxonomy/term)$ /filter/tips$ >;

my @smurls;

open(SITEMAP, "zcat $sitemapfn |") or die "Unable to open sitemap: $sitemapfn";

while (<SITEMAP>) {
  my $line = $_;
  if ($line =~ m/^\s+<loc>(.*?)<\/loc>/ ) {
    my $url = $1;
    chomp $url;
    push(@smurls, $url);
  }
}

close(SITEMAP);

# There's our sitemap, let's check it with wget.  We'll print out the
# lines that need to go into the sitemap exception file.

my $errcount = 0;
for my $url (@smurls) {
  threads->create({'context' => 'scalar'}, \&checkurl, $url);
  cleanthreads(0, \&printifbad);
}

# Get the rest of our threads:
cleanthreads(1, \&printifbad);

unless ($errcount == 0) {
  print "--------------------- End Errors ---------------------\n";
  print "$errcount urls invalid\n";
} else {
  print "No bad URLs in sitemap!\n";
}

# In part 2, we look through the log files and determine which patterns
# are no longer being used.  First, let's get our patterns.
my @patternlist;
my @unusedlist;
my @filecontents;

@filecontents = io($smconfig)->chomp->slurp;
for my $line (@filecontents) {
  if ($line =~ m/<filter\s+action="drop"\s+type="regexp"\s+
                 pattern="([^\"]+)"\s*\/>/x ) {
    my $thispat = $1;
    # I've used org at the beginning of my patterns (to distinguish
    # between websites, but the access logs omit the initial part ... strip
    # if necessary.
    $thispat =~ s/^org(\/.*)$/$1/;
    for my $kpat (@keeppats) {
      if ( $kpat eq $thispat) { $thispat = undef; last }
    }
    if (defined($thispat)) { push(@patternlist, $thispat); }
  }
}

# We have our patterns, now let's do the rest of the checking.  We'll
# look through each of the log files.  If a pattern doesn't match
# somewhere in the whole file, we'll move it to the unused list.
# After checking a file, we copy the unused list back to the pattern
# list, and repeat.  After checking all files, anything left will be
# truly unused.

print "Pattern list starts with " . ($#patternlist + 1) . " entries\n";

for my $fn (@loglist) {
  @unusedlist = ();
  my $fullfile = io($fn)->chomp->slurp;
  # Now loop through patterns
  for my $pat (@patternlist) {
    if ($fullfile !~ m/$pat/m) {
      push(@unusedlist, $pat);
    }
  }
  @patternlist = @unusedlist;
}

print "---- Unused patterns (can be removed from $smconfig ----\n";
@unusedlist = ();
print join("\n", @patternlist) . "\n";

exit(0);

# Taken from threadbossworker.pl, see
# http://www.radkeland.org/boss-worker-threaded-model-implemented-perl
#
for details.  (make sure to scan forward for newer versions)
sub cleanthreads($$@) {
  my $mode = shift @_;
  my $func = shift @_;
  my @othervals = (defined($_[0])) ? (@_) : (undef);
  my @thrlist;

  # First, get our list of threads to join.
  if ($mode == 0) {
    # We will guarantee that at least one thread is joinable or we are
    # not at maximum capacity by decrementing the semaphore (then
    # incrementing it).
    $sem->down;
    $sem->up;
    @thrlist = threads->list(threads::joinable);
  } elsif ($mode == 1) {
    @thrlist = threads->list(threads::all);
  } else {
    die "Value $mode for \$mode not allowed in cleanthreads";
  }

  # And do the deed - oddly, the parent doesn't seem to get included in
  # @thrlist when generated as above.  Keep an eye on this.
  foreach my $thr (@thrlist) {
    &$func($thr->join(), @othervals);
  }
  return;
}

# This is a threaded function to check a url.
sub checkurl($) {
  $sem->down;
  my $url = shift;

  # Grrr ... the injection attacks use bizzare characters ... we'll need
  # to quote them for wget to work properly.
  my $thisurl = quotemeta($url);
  my $response = `wget --spider $thisurl 2>&1`;
  my $retval;
  if ($response =~ /ERROR/) {
    $retval = "  <filter  action=\"drop\" type=\"regexp\" pattern=\"$thisurl\" />\n";
  } else {
    $retval = undef;
  }
  $sem->up;
  return $retval;
}

# This routine prints the bad line (if there is one), otherwise, it
# does nothing.  It is also responsible for printing the header and
# updating the global $errcount.
sub printifbad($) {
  my $out = shift;

  unless (defined($out)) { return }

  print "-------------------- Begin Errors --------------------\n"
    unless $errcount;
  $errcount++;

  print $out;
  return;
}