--- /dev/null
+#!/usr/bin/perl\r
+#-Description-------------------------------------------\r
+# Small script to auto-generate URL Alias files for 5.2+ AWStats\r
+# Requires two Perl modules below.\r
+# Note: Doesn't currently support https.\r
+# From original title-grabber.pl file (Feedback to: simonjw@users.sourceforge.net)\r
+# Changed by eldy@users.sourceforge.net\r
+#-------------------------------------------------------\r
+use LWP::UserAgent;\r
+use HTML::TokeParser;\r
+\r
+use strict;no strict "refs";\r
+\r
+# variables, etc\r
+# The Revision keyword below is expanded by the version control system. When\r
+# it has not been expanded the pattern does not match, so the build number is\r
+# simply left out of the version string instead of reading an undefined $1.\r
+my $REVISION='$Revision$';\r
+if ($REVISION =~ /\s(.*)\s/) { $REVISION=$1; } else { $REVISION=""; }\r
+my $VERSION=($REVISION ne "") ? "0.91 (build $REVISION)" : "0.91";\r
+\r
+# Optional site configuration name; when set it is appended to the default\r
+# file names and used as the host name if -server is not given.\r
+my $SITECONFIG = "";\r
+# Markers that delimit the URL section inside the AWStats history file\r
+my $FILEMARKER1 = "BEGIN_SIDER";\r
+my $FILEMARKER2 = "END_SIDER";\r
+\r
+my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);\r
+\r
+# AWStats names its history files awstatsMMYYYY[.config].txt, so the month\r
+# must be zero-padded to two digits (localtime months are 0-based).\r
+my $fullMonth = sprintf("%02d", $mon + 1);\r
+my $fullYear = $year + 1900;\r
+\r
+# Where everything we need to know is installed\r
+my $awStatsDataDir = "/var/cache/awstats";\r
+my $awStatsOutDir = "/opt/awstats/wwwroot/cgi-bin/plugins";\r
+\r
+# LWP settings\r
+# UA string passed to server. You should add this to SkipUserAgents in the\r
+# awstats.conf file if you want to ignore hits from this code.\r
+my $userAgent = "awstats-title-grabber/0.9";\r
+# Put a sensible e-mail address here\r
+my $spiderOwner = "email\@my.domain.name";\r
+# Timeout (in seconds) for each HTTP request (increase on slow connections)\r
+my $getTimeOut = 2;\r
+# Proxy server to use when doing http/s - leave blank if you don't have one\r
+#my $proxyServer = "http://my.proxy.server:port/";\r
+my $proxyServer = "";\r
+# Hosts not to use a proxy for\r
+my @hostsNoProxy = ("host1","host1.my.domain.name");\r
+\r
+# ====== main\r
+\r
+# Option state; every value below is a default that a command line\r
+# argument may replace.\r
+my $helpfound=0;\r
+my $nohosts=0;\r
+my $overwritedata=0;\r
+my $hostname="";\r
+\r
+# Default AWStats history file to read\r
+my $fileToOpen = join("",\r
+    $awStatsDataDir, "/awstats", $fullMonth, $fullYear,\r
+    ($SITECONFIG ? ".$SITECONFIG" : ""),\r
+    ".txt");\r
+# Default URL alias file to build\r
+my $urlAliasFile = join("", "urlalias", ($SITECONFIG ? ".$SITECONFIG" : ""), ".txt");\r
+\r
+# Walk the arguments once; the first pattern that matches an argument wins,\r
+# so "-historyfile=" is recognised before the looser "-h" help pattern.\r
+foreach my $arg (@ARGV) {\r
+    if    ($arg =~ /^-*historyfile=([^\s&]+)/i)  { $fileToOpen = $1; }\r
+    elsif ($arg =~ /^-*urlaliasfile=([^\s&]+)/i) { $urlAliasFile = $1; }\r
+    elsif ($arg =~ /^-*server=(.*)/i)            { $hostname = $1; }\r
+    elsif ($arg =~ /^-*h/i)                      { $helpfound = 1; }\r
+    elsif ($arg =~ /^-*overwrite/i)              { $overwritedata = 1; }\r
+}\r
+\r
+# Resolve the host name to query: an explicit -server= value wins, otherwise\r
+# fall back to the site config name; with neither we bomb out to usage.\r
+if ($hostname eq "") {\r
+    if ($SITECONFIG eq "") {\r
+        $nohosts=1;\r
+    } else {\r
+        $hostname=$SITECONFIG;\r
+    }\r
+}\r
+\r
+# Show usage help\r
+# Split $0 into directory, program name and extension. The extension is only\r
+# taken from $1 when the substitution really matched; otherwise $1 would still\r
+# hold the capture of the previous match (the whole program name).\r
+my $DIR; my $PROG; my $Extension;\r
+($DIR=$0) =~ s/([^\/\\]*)$//;\r
+$PROG=$1;\r
+$Extension=($PROG =~ s/\.([^\.]*)$//) ? $1 : "";\r
+my $scriptName=($Extension ne "") ? "$PROG.$Extension" : $PROG;\r
+if ($nohosts || $helpfound || ! @ARGV) {\r
+    print "\n----- $PROG $VERSION -----\n",\r
+          "$PROG generates urlalias file for the supplied site configuration.\n",\r
+          "It uses an AWStats history data file as a source.\n",\r
+          "If you need to use a hostname for retrieving page headers other than the\n",\r
+          "one read from the config file, pass it using the -server option.\n",\r
+          "If you pass the -overwrite option the urlaliases file will only ever\n",\r
+          "contain the current set of active page titles (as most recently generated\n",\r
+          "by AWStats).\n",\r
+          "\n",\r
+          "Usage: $scriptName -server=www.myserver.com [options]\n",\r
+          "\n",\r
+          "Where options are:\n",\r
+          " -historyfile=AWStats input history file name\n",\r
+          " -urlaliasfile=AWStats output urlalias file to build\n",\r
+          " -overwrite\n",\r
+          "\n",\r
+          "Example: $scriptName -server=www.someotherhost.com\n",\r
+          "\n",\r
+          "This is default configuration used if no option is used on command line:\n",\r
+          "AWStats input history file: $fileToOpen (overwritten by -historyfile option)\n",\r
+          "AWStats output urlalias file: $urlAliasFile (overwritten by -urlaliasfile option)\n",\r
+          "\n";\r
+    exit 0;\r
+}\r
+\r
+# URLs (first tab-separated column) already present in the urlalias file\r
+my @archivedKeys=();\r
+# Number of entries stored in @archivedKeys\r
+my $counter = 0;\r
+# Title of the page currently being fetched; filled in by the HTML handlers\r
+my $pageTitle = "";\r
+\r
+# only read the alias file if we want to do a comparison\r
+# and append new items only (i.e. not overwrite)\r
+if($overwritedata == 0) {\r
+    my $aliasFh;\r
+    if (open($aliasFh, '<', $urlAliasFile)) {\r
+        while (my $line = <$aliasFh>) {\r
+            chomp $line; $line =~ s/\r//;\r
+            my ($key) = split(/\t/, $line);\r
+            # a blank line has no key column at all - nothing to remember\r
+            next if ! defined $key;\r
+            push @archivedKeys, $key;\r
+        }\r
+        close($aliasFh);\r
+        $counter = scalar @archivedKeys;\r
+    } else {\r
+        # A missing alias file is normal on the first run and stays silent;\r
+        # an existing but unreadable one is worth reporting because every\r
+        # URL would otherwise be treated as new.\r
+        my $openError = "$!";\r
+        if (-e $urlAliasFile) {\r
+            warn "Warning: Can't read urlalias file $urlAliasFile: $openError\n";\r
+        }\r
+    }\r
+}\r
+\r
+# open current months AWStats data file\r
+print "Open input file $fileToOpen\n";\r
+open(my $historyFh, '<', $fileToOpen) || die "Error: Can't open AWStats input history file $fileToOpen: $!";\r
+binmode $historyFh;\r
+\r
+# Lookup table of URLs that already have an alias (only consulted when we\r
+# are appending, i.e. not in overwrite mode)\r
+my %isArchived = map { $_ => 1 } grep { defined $_ } @archivedKeys;\r
+\r
+my @field=();\r
+my @addToAliasFile=();\r
+my $addToAliasFileCount = 0;\r
+# True while we are between the BEGIN_SIDER and END_SIDER marker lines\r
+my $inUrlSection = 0;\r
+# A single pass over the file: reaching end of file always ends the loop,\r
+# even when the closing marker is missing (truncated history file).\r
+while (defined(my $line = <$historyFh>)) {\r
+    chomp $line; $line =~ s/\r//;\r
+\r
+    # Split line out into fields\r
+    @field=split(/\s+/,$line);\r
+    if (! $field[0]) { next; }\r
+\r
+    if (! $inUrlSection) {\r
+        # Outside the URL section only the start marker is of interest\r
+        if ($field[0] eq $FILEMARKER1) { $inUrlSection = 1; }\r
+        next;\r
+    }\r
+    if ($field[0] eq $FILEMARKER2) {\r
+        $inUrlSection = 0;\r
+        next;\r
+    }\r
+\r
+    # compare awstats data entry against urlalias entry\r
+    # only if we don't just want to write current items\r
+    # to the file (i.e. overwrite)\r
+    if ($overwritedata == 0 && $isArchived{$field[0]}) { next; }\r
+\r
+    # it's a new URL (or we overwrite), so add to list of items to retrieve\r
+    push @addToAliasFile, $field[0];\r
+    $addToAliasFileCount++;\r
+}\r
+\r
+print "Found " . $addToAliasFileCount . " new URLs with no alias.\n";\r
+\r
+close($historyFh);\r
+\r
+# Text that will be written to the urlalias file\r
+my $fileOutput = "";\r
+\r
+print "Looking thoose pages to get alias...\n";\r
+\r
+# Build the user agent (browser) object in one go: the name reported to the\r
+# server, the owner's e-mail address and the per-request timeout.\r
+my $ua = LWP::UserAgent->new(\r
+    agent   => $userAgent,\r
+    from    => $spiderOwner,\r
+    timeout => $getTimeOut,\r
+);\r
+unless ($proxyServer eq "") {\r
+    # route http/https requests through the proxy ...\r
+    $ua->proxy(["http","https"],$proxyServer);\r
+    # ... except for the hosts listed as directly reachable\r
+    $ua->no_proxy(@hostsNoProxy);\r
+}\r
+\r
+# Now lets build the contents to write (or append) to urlalias file:\r
+# one "url<TAB>title" line per URL that still needs an alias\r
+foreach my $newAlias (@addToAliasFile) {\r
+    $fileOutput .= Generate_Alias_List_Entry($newAlias) . "\n";\r
+}\r
+\r
+# write the data back to urlalias file\r
+print "Wirte file $urlAliasFile\n";\r
+# append new entries by default, replace the whole file with -overwrite\r
+my $openMode = ($overwritedata == 0) ? '>>' : '>';\r
+# report the system error ($!), not the unrelated content of $_\r
+open(my $aliasOut, $openMode, $urlAliasFile) || die "Error: Failed to open file for writing: $!";\r
+print {$aliasOut} $fileOutput;\r
+# buffered write errors (disk full, ...) only show up when closing\r
+close($aliasOut) || die "Error: Failed to write file $urlAliasFile: $!";\r
+\r
+exit();\r
+#--------------------------- End of Main -----------------------------\r
+\r
+\r
+#\r
+# Generate a new line for the urlalias file by doing a http get using data\r
+# supplied.\r
+#\r
+# Parameter: the path & document part of the URL (as found in the AWStats\r
+#            history file); it is appended to "http://$hostname".\r
+# Returns:   "path<TAB>title" without trailing newline. The title is\r
+#            "Unknown Title" when the page cannot be fetched or has no\r
+#            usable title.\r
+# Uses the file level variables $ua, $hostname and $pageTitle.\r
+#\r
+sub Generate_Alias_List_Entry {\r
+\r
+    # take in the path & document\r
+    my $urltoget = shift;\r
+\r
+    # title_handler stores the title here, so start from a clean slate\r
+    $pageTitle = "";\r
+\r
+    # build a full HTTP request to pass to user agent\r
+    my $fullurltoget = "http://" . $hostname . $urltoget;\r
+\r
+    # Create a HTTP request\r
+    print "Download page $fullurltoget\n";\r
+    my $req = HTTP::Request->new(GET => $fullurltoget);\r
+\r
+    # Pass request to the user agent and get a response back\r
+    my $res = $ua->request($req);\r
+\r
+    # Parse returned document for page title\r
+    if ($res->is_success()) {\r
+        my $p = HTML::Parser->new(api_version => 3);\r
+        $p->handler( start => \&title_handler, "tagname,self");\r
+        $p->parse($res->content);\r
+    } else {\r
+        print "Failed to get page: ".$res->status_line."\n";\r
+    }\r
+\r
+    # The alias file holds one tab separated entry per line, so a title\r
+    # containing tabs or line breaks would corrupt it: collapse every run\r
+    # of whitespace into a single blank and trim both ends.\r
+    $pageTitle = "" if ! defined $pageTitle;\r
+    $pageTitle =~ s/\s+/ /g;\r
+    $pageTitle =~ s/^ //;\r
+    $pageTitle =~ s/ $//;\r
+\r
+    if ($pageTitle eq "") {\r
+        $pageTitle = "Unknown Title";\r
+    }\r
+    return $urltoget . "\t" . $pageTitle;\r
+}\r
+\r
+# Start-tag handler for HTML::Parser, registered with "tagname,self".\r
+# Once a <title> tag opens it installs a text handler that stores the decoded\r
+# text in $pageTitle and an end handler that stops parsing at </title>.\r
+sub title_handler {\r
+    my ($tagName, $parser) = @_;\r
+    return unless $tagName eq "title";\r
+    $parser->handler(text => sub { $pageTitle = shift }, "dtext");\r
+    $parser->handler(end => sub { my ($closedTag, $p) = @_; $p->eof if $closedTag eq "title"; },"tagname,self");\r
+}
\ No newline at end of file