runWebalizer is used to combine logfiles from multiple Apache servers and submit them to the Webalizer logfile analysis program. This is great if you're running load-balanced Apache servers.

The trick to running runWebalizer is to get all your log files in the same directory. This can be done via NFS or ssh.
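For example, with NFS you can export /var/log/httpd from each web server and mount it under a common directory on the machine that runs the stats (these mount points match the defaults in the script below):

   mount www1.averdrivetronics.com:/var/log/httpd /weblogs/www1
   mount www2.averdrivetronics.com:/var/log/httpd /weblogs/www2

If NFS isn't available, pulling the logs across with something like rsync over ssh works just as well.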

I really need to rewrite this program to use more standardized methods. Some people may still find it useful, but it really isn't effective for Apache servers running in today's environment.

It replicates what logrotate does and forces all Apache servers to rotate their logs at the same time. This was okay in 1997 with two web servers, but it doesn't scale well now. If I needed to do this today, I would probably log all the web server data to a database using syslog-ng, or log to a shared flatfile using a syslog server. I'm leaving the code here in case anybody is interested, but I wouldn't recommend using it. Go to O'Reilly instead.
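If you're curious, the syslog approach might look something like this. This is an untested sketch, and the logger tag, facility, and file names are placeholders of mine. Each Apache server pipes its access log to the local syslog, which forwards everything to a central syslog-ng host:

   # httpd.conf on each web server: pipe hits to the local syslog
   CustomLog "|/usr/bin/logger -t httpd -p local6.info" combined

   # /etc/syslog.conf on each web server: forward them to the log host
   local6.info                                     @loghost

   # syslog-ng.conf on the central log host: collect into one flatfile
   source s_net        { udp(ip(0.0.0.0) port(514)); };
   filter f_httpd      { facility(local6); };
   destination d_httpd { file("/var/log/httpd/combined.log"); };
   log { source(s_net); filter(f_httpd); destination(d_httpd); };

Webalizer can then be run against the combined file directly, with no rotation gymnastics across servers.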

#!/usr/bin/perl -w
#
# runWebalizer: Webalizer redundant server script.
#     Written by: Joel Griffiths joelg at the domain gadgetwiz dot com
#
# Concatenates and rotates log files on redundant virtual web servers
# in unison with webalizer updates.
#
# Copyright (C) 1997 Aver Drivetronics
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# See COPYING.TXT for details.
# 
# Original source code can be found at 
# http://www.averdrivetronics.com/webalizer
#
# This program is meant to be run by a cronjob.
# Web server log files should be NFS mounted to the directory
# specified by $WEBLOGS.
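#
# For example, a nightly crontab entry might look like this
# (illustrative; adjust the path and schedule to taste):
#     30 4 * * * /usr/local/sbin/runWebalizer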
# 
# Example VirtualHost directive in httpd.conf
#<VirtualHost *>
#    ServerAdmin webmaster@averdrivetronics.com
#    DocumentRoot /home/drivetronics/public_html
#    ServerName www.averdrivetronics.com
#    ServerAlias averdrivetronics.com www.averdrivetronics.com
#    LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \
#      \"%{User-Agent}i\"" combined
#    CustomLog logs/custom_averdrivetronics.com.log combined
#</VirtualHost>
#
# Log files for each virtual host should be stored as
# custom_DOMAINNAME.log. The DOMAINNAME is extracted
# from the logfile name and used to create a separate
# sub-directory for each domain.
# 
# You can configure a different logfile pattern matching
# mechanism below by changing $LOG_HEADER and $LOG_FOOTER below.
# By default:
#     $LOG_HEADER="custom_";
#     $LOG_FOOTER='\.log';
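#
# For example, with the defaults a logfile named
#     custom_averdrivetronics.com.log
# yields the domain averdrivetronics.com, and its stats end up
# in the averdrivetronics.com sub-directory of the stats directory.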
#

use strict;
use warnings;
use Fcntl ':mode';
use Data::Dumper;

#############################################################
# USER CONFIG VARIABLES
#############################################################
# Send Debugging Information - Set to 1 to turn on messages
my $DEBUG = 0;

# Enable Logfile Rotation
my $ROTATE_LOGS = 1;

# The number of logfiles to keep
my $MAXLOGS = 4;

# The names of your webserver NFS mounted directories
# ie. mount www1.averdrivetronics.com:/var/log/httpd /weblogs/www1
#
# Single Web Server
#my @SERVERS = ( '' );
#
# Redundant servers (Add as many servers as required)
my @SERVERS = ( 'www1', 'www2');

# Logfile configuration information
# Probably would have been better with a regular expression.
# This applies to how the log files are stored. I store my
# logfiles as custom_DOMAINNAME.log. DOMAINNAME is extracted
# and is used to create a directory for each virtual domain.
my $LOG_HEADER = "custom_";
my $LOG_FOOTER = '\.log'; # single quotes keep the backslash for the regex

# Rotate the logfile after it reaches a certain size (in kilobytes)
my $ROTATE_SIZE = 1024; # 1024 KB = 1 MB

# Logfile locations
# Single Web Server
#
# my $WEBLOG_DIR = "/var/log/httpd";
#
# NFS mounted directory for logfiles (redundant web servers)
my $WEBLOG_DIR = "/weblogs";

# Where to store the stats after rotating the logfiles
my $STATS_DIR = "$WEBLOG_DIR/stats";

# Where is the webalizer script
my $WEBALIZER = "/usr/bin/webalizer";

# I need the cat command or the LogSorter command to concatenate the
# logfiles. The LogSorter, available at http://ostermiller.org/webalizer/
# works best for me, but I default to cat to make things easier.
#my $CAT = "/usr/java/j2sdk1.4.0/bin/java -classpath " .
#      "/usr/java/apps LogSorter";
my $CAT = "/bin/cat";

# Better make sure that the ssh command can get to each web server after
# rotating the logs. Got to restart the servers. Don't worry about this
# if you have log rotation turned off.
#
# Remember to add this system's public key to the
# /root/.ssh/authorized_keys file on the remote systems
# if you want to rotate remote logs.
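#
# A typical one-time setup looks something like this (illustrative):
#     ssh-keygen -t rsa          # create a passwordless key pair here
#     ssh-copy-id root@www1      # install the public key on each server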
my $SSH = "/usr/bin/ssh";

# Command required to restart the httpd service.
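# ("/usr/sbin/apachectl graceful" is a common alternative on systems
# without the Red Hat service wrapper; adjust to taste.)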
my $RESTART = "/sbin/service httpd reload";

#############################################################
# EDIT BELOW HERE AT YOUR OWN RISK
#############################################################

my $maxLogRange = $MAXLOGS;
my $restartFlag = 0;

sub dprint {
   if($DEBUG == 0) {
      return;
   }
   print @_;
}

# getDomains
#
# Parse domains from the log directories of several
# NFS mounted web servers. The domains are represented
# by a custom_domain.com.log logfile.
#
# The return value is a hash of arrays:
#
# $VAR1 = 'avercomputer.com';
# $VAR2 = [
#        '/weblogs/www1/custom_avercomputer.com.log',
#        '/weblogs/www2/custom_avercomputer.com.log'
# ];
#
sub getDomains() {
   my %domains = ();
   my $server = ""; # Used to store current server name
   my $filename = ""; ## Filename to test
   my $lognumber = 0; ## Rotated lognumber

   foreach $server (@SERVERS) {
      dprint "WORKING ON SERVER $server\n";
      opendir(MYDIR, "$WEBLOG_DIR/$server") ||
         die "Cannot open directory $WEBLOG_DIR/$server";
      while( $filename = readdir(MYDIR) ) {
         # Check filename for custom_domain.subdomain.log

         $_ = $filename;

         # Delete any extra archived log files
         # Probably doesn't belong here, but is faster
         # because I don't have to loop through an
         # opendir again.
         if( m{(($LOG_HEADER.*$LOG_FOOTER)\.(\d+))$} ) {
            $maxLogRange = $3 > $maxLogRange ? $3 : $maxLogRange;
            if($3 >= $MAXLOGS) {
               unlink("$WEBLOG_DIR/$server/$filename");
            }
         # This is not an archived log file
         # so, store it in the hash[x].
         } else {
            if(s/$LOG_HEADER(.*\..*)$LOG_FOOTER/$1/ ) {
               dprint "Logfile $filename found on $server for the $1 domain\n";
               push(@{$domains{$1}}, "$WEBLOG_DIR/$server/$filename");
            } else {
               #dprint "$filename is NOT A DOMAIN FILE\n";
            }
         }
      }
      closedir(MYDIR);
   }

   dprint Dumper(%domains);
   return %domains;

}

# checkdir
#
# Takes an array of domains and checks the directory
# structure for the output directory. If the directory
# does not yet exist for the domain, create the directory.
#
sub checkdir {
   my @domains = @_;
   my $dirStatus = 0;
   
   my $outputDir = "";
   my $domain = "";
   for $domain (@domains) {
      $outputDir = $STATS_DIR . "/" . $domain;

      # We must have write access to the stats directory
      # so that we can add new domains automagically.
      unless (-w $STATS_DIR) {
         die "Cannot write to stats directory: $STATS_DIR";
      }

      # Create the directory if it doesn't exist
      if (! -e $outputDir ) {
         dprint "Making directory $outputDir\n";
         mkdir($outputDir, 0755) ||
            die "Cannot create directory $outputDir";
      } else {
         dprint "Directory $outputDir already exists\n";
      }

      # Now check any existing directories (as a side-effect,
      # I get to check the one I just created too) to see
      # if they are writable by this program...
      #
      # In a perfect world, I would check all the files
      # in the directories too. TODO
      if (! -d $outputDir) {
         die "$outputDir is not a directory";
      }

      if (! -r $outputDir || ! -w $outputDir || ! -x $outputDir) {
         die "$outputDir is not readable, writable, and searchable";
      }

   }
}

# runWebalizer
#
# Run the weblizer utility
#
# Takes a hash of arrays where the key is the domain name
# and the array at that key contains the logfile location.
#
# Roughly equivalent, per domain, to:
#   cat www1/custom_averdrivetronics.com.log \
#       www2/custom_averdrivetronics.com.log > $outputDir/combinedLogs.log
#   webalizer -D $outputDir/dns.cache -o $outputDir $outputDir/combinedLogs.log
#
sub runWebalizer {
   my %domains = @_; # Take one or more domains
   my $logfile = "";

   dprint Dumper(%domains);

   my $outputDir = "";
   my $domain = "";
   my $output = "";

   foreach $domain (keys %domains) {
      dprint "Working on domain for $domain\n";
      $outputDir = $STATS_DIR . "/" . $domain;

      # Cat logfiles from all servers into a temporary file
      my $command = $CAT;
      foreach $logfile ( @{$domains{$domain}} ) {
         $command .= " $logfile ";
      }

      $command .= " > $outputDir/combinedLogs.log";
      dprint "$command\n";
      $output = qx/$command/;

      # Run the webalizer on the combined log file
      $command = "$WEBALIZER -n \"$domain\" -r \"$domain\" " .
         "-s \"\*$domain\" -t \"$domain\" -N 5 " .
         "-D $outputDir/dns.cache -o $outputDir $outputDir/combinedLogs.log";

      $output = qx/$command/;
      dprint "\n-----Output for Command:\n\t$command\n";
      dprint "$output\n";
      dprint "\n--------------------------------------\n";
   }

   return;

}

# checkWE and deleteFile are unfinished stubs; nothing in this
# script calls them yet.
sub checkWE {
   my ($filename) = @_;
}

sub deleteFile {
   my ($filename) = @_;

   dprint "Deleting file $filename";
}

sub rotateLogs {
   my %domains = @_;
   my $lognumber;

   my $logfile = "";
   my $domain = "";

   my $command = "";
   my $output = "";

   my @filestat;
   my $result;

   # Remove the extra logfiles before rotating the logs.
   # The maxLogRange was set during the getDomains function
   # and is not being set here, because that would require
   # opening another readdir...
   foreach $domain (keys %domains) {
      dprint "--- Rotating logfiles for $domain ---\n";
      foreach $logfile (@{$domains{$domain}}) {

         # Rotate the file unless it is new
         @filestat = stat($logfile);
         #$result = (time - $filestat[10]) / 86400;
         #dprint " Days since Inode Change Time for $logfile: $result\n";
         $result = $filestat[7] / 1024; # size in kilobytes
         dprint " Logfile size for $logfile: $result\n";
         next if( $result <= $ROTATE_SIZE);

         # Set flag to restart httpd if logfile is rotated
         $restartFlag = 1;

         # Remove any out-of-range log files. Note that the shell
         # character class [$MAXLOGS-$maxLogRange] only behaves as
         # a numeric range while both numbers are single digits.
         $command = "rm -f $logfile\[$MAXLOGS\-$maxLogRange\]";
         $output = qx/$command/;
         dprint "$command\n";
         dprint "$output\n";

         # Now go through each log file and copy it to
         # the next higher log file (eg. .1 becomes .2).
         # Count backwards from the highest log number so the
         # copies don't clobber each other. (Assigning to the
         # foreach variable over a constant range is a fatal
         # "read-only value" error, so use a separate counter.)
         my $nextlognumber = 0;
         foreach my $i (1 .. ($MAXLOGS - 1)) {
            $lognumber = $MAXLOGS - $i;
            if(-e "$logfile.$lognumber") {
               $nextlognumber = $lognumber + 1;
               $command = "cp $logfile.$lognumber " .
                           "$logfile.$nextlognumber";
               $output = qx/$command/;
               dprint "$command\n";
               dprint "$output\n";
            }
         }

         # Archive the old log file
         $command = "cp -f $logfile $logfile.1";
         $output = qx/$command/;
         dprint "$command\n";
         dprint "$output\n";

         # Remove the live log file now that it has been archived
         dprint "unlink($logfile)\n";
         unlink($logfile);
      }
      dprint "-------------------------------------\n";
   }
}

# Restart the web servers. Assuming that ssh
# works without a password from this system to the
# target system.
#
sub restartServers {
   my $server = "";

   my $command;
   my $output;
   if($restartFlag == 0) {
      dprint "No logfiles were rotated...\n";
      dprint "Don't need to restart servers\n";
      return;
   }

   foreach $server (@SERVERS) {
      $command = "$SSH root\@$server $RESTART";
      $output = qx/$command/;
      dprint "$command\n";
      dprint "$output\n";
   }
}

# Get a list of the domains being logged
my %domains = getDomains();

# Check and create directories for webalizer output
checkdir(keys %domains);

# Run Webalizer for each domain
runWebalizer(%domains);

if($ROTATE_LOGS) {
   # Rotate the logs
   rotateLogs(%domains);

   # Restart the servers
   restartServers();
}