(file) Return to urlaliasbuilder.pl CVS log (file) (dir) Up to [RizwankCVS] / geekymedia_web / awstats-6.3 / tools

  1 rizwank 1.1 #!/usr/bin/perl
  2             #-------------------------------------------------------
  3             # Small script to auto-generate URL Alias files for 5.2+ AWStats
  4             # Requires two Perl modules below.
  5             # From original title-grabber.pl file
  6             # 		(Feedback/suggestions to: simonjw@users.sourceforge.net)
  7             # Modified by eldy@users.sourceforge.net
  8             # 
  9             # Note: If you want to retrieve document titles over SSL you must have OpenSSL and
 10             #       the Net::SSL(eay) Perl Module available.  This code will check that SSL is
 11             #		supported before attempting to retrieve via it.
 12             #-------------------------------------------------------
 13             use LWP::UserAgent;
 14             use HTML::TokeParser;
 15             
 16             use strict;no strict "refs";
 17             
 18             
 19             # variables, etc
 20             my $REVISION='$Revision: 1.6 $'; $REVISION =~ /\s(.*)\s/; $REVISION=$1;
 21             my $VERSION="1.0 (build $REVISION)";
 22 rizwank 1.1 
 23             ############### EDIT HERE ###############
 24             
 25             # you can set this manually if you will only grep one site
 26             my $SITECONFIG = "";
 27             
 28             # Where the default input is located.
 29             my $awStatsDataDir = "/var/lib/awstats";
 30             
 31             # Throttle HTTP requests - help avoid DoS-like results if on a quick network.
 32             # Number is the number of seconds to pause between requests. Set to zero for
 33             # no throttling.
 34             my $throttleRequestsTime = 0;
 35             
 36             # LWP settings
 37             # UA string passed to server.  You should add this to SkipUserAgents in the
 38             # awstats.conf file if you want to ignore hits from this code.
 39             my $userAgent = "urlaliasbuilder/$VERSION";
 40             # Put a sensible e-mail address here
 41             my $spiderOwner = "spider\@mydomain.com";
 42             
 43 rizwank 1.1 # Timeout (in seconds) for each HTTP request (increase on slow connections)
 44             my $getTimeOut = 2;
 45             # Proxy server to use when doing http/s - leave blank if you don't have one
 46             #my $proxyServer = "http://my.proxy.server:port/";
 47             my $proxyServer = "";
 48             # Hosts not to use a proxy for
 49             my @hostsNoProxy = ("host1","host1.my.domain.name");
 50             # Make sure we don't download multi-megabyte files! We need only head section
 51             my $maxDocSizeBytes = 4096; # number is bytes
 52             
 53             ############### DON'T EDIT BELOW HERE ###############
 54             
 55             # Don't edit these
 56             my $FILEMARKER1 = "BEGIN_SIDER";
 57             my $FILEMARKER2 = "END_SIDER";
 58             
 59             my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
 60             
 61             my $fullMonth = sprintf("%02d",$mon+1);
 62             my $fullYear = sprintf("%04d",$year+1900);
 63             
 64 rizwank 1.1 
 65             # ====== main ======
 66             
 67             # Change default value if options are used
 68             my $helpfound=0;
 69             my $nohosts=0;
 70             my $overwritedata=0;
 71             my $hostname="";
 72             my $useHTTPS=0;
 73             
 74             # Data file to open
 75             my $fileToOpen = $awStatsDataDir . "/awstats" . $fullMonth . $fullYear . ($SITECONFIG?".$SITECONFIG":"") . ".txt";
 76             # URL Alias file to open
 77             my $urlAliasFile = "urlalias" . ($SITECONFIG?".$SITECONFIG":"") . ".txt";
 78             
 79             for (0..@ARGV-1) {
 80             	if ($ARGV[$_] =~ /^-*urllistfile=([^\s&]+)/i) 	{ $fileToOpen="$1"; next; }
 81             	if ($ARGV[$_] =~ /^-*urlaliasfile=([^\s&]+)/i) 	{ $urlAliasFile="$1"; next; }
 82             	if ($ARGV[$_] =~ /^-*site=(.*)/i)      			{ $hostname="$1"; next; }
 83             	if ($ARGV[$_] =~ /^-*h/i)     		  			{ $helpfound=1; next; }
 84             	if ($ARGV[$_] =~ /^-*overwrite/i)     	 		{ $overwritedata=1; next; }
 85 rizwank 1.1 	if ($ARGV[$_] =~ /^-*secure/i)     	 			{ $useHTTPS=1; next; }	
 86             }
 87             
 88             # if no host information provided, we bomb out to usage
 89             if(! $hostname && ! $SITECONFIG) { $nohosts=1; }
 90             
 91             # if no hostname set (i.e. -site=) then we use the config value
 92             if(! $hostname && $SITECONFIG) { $hostname=$SITECONFIG; }
 93             
 94             # Show usage help
 95             my $DIR; my $PROG; my $Extension;
 96             ($DIR=$0) =~ s/([^\/\\]*)$//; ($PROG=$1) =~ s/\.([^\.]*)$//; $Extension=$1;
 97             if ($nohosts || $helpfound || ! @ARGV) {
 98             	print "\n----- $PROG $VERSION -----\n";
 99             	print ucfirst($PROG)." generates an 'urlalias' file from an input file.\n";
100             	print "The input file must contain a list of URLs (It can be an AWStats history file).\n";
101             	print "For each of thoose URLs, the script get the corresponding HTML page and catch the\n";
102             	print "header information (title), then it writes an output file that contains one line\n";
103             	print "for each URLs and several fields:\n";
104             	print "- The first field is the URL,\n";
105             	print "- The second is title caught from web page.\n";
106 rizwank 1.1 	print "This resulting file can be used by AWStats urlalias plugin.\n";
107             	print "\n";
108             	print "Usage:  $PROG.$Extension  -site=www.myserver.com  [options]\n";
109             	print "\n";
110             	print "The site parameter contains the web server to get the page from.\n";
111             	print "Where options are:\n";
112             	print "  -urllistfile=Input urllist file\n";
113             	print "    If this file is an AWStats history file then urlaliasbuilder will use the\n";
114             	print "    SIDER section of this file as its input URL's list.\n";
115             	print "  -urlaliasfile=Output urlalias file to build\n";
116             	print "  -overwrite    Overwrite output file if exists\n";
117             	print "  -secure       Use https protocol\n";
118             	print "\n";
119             	print "Example: $PROG.$Extension -site=www.someotherhost.com\n";
120             	print "\n";
121             	print "This is default configuration used when no option are provided on command line:\n";
122             	print "Input urllist file: $fileToOpen (overwritten by -urllistfile option)\n";
123             	print "Output urlalias file: $urlAliasFile (overwritten by -urlaliasfile option)\n";
124             	print "\n";	
125             	print "This script was written from Simon Waight original works title-grabber.pl.\n";
126             	print "\n";
127 rizwank 1.1 	exit 0;
128             }
129             
# URLs that already have an entry in the alias file, how many there are,
# and the title of the page currently being processed (filled in by the
# HTML parser callbacks).
my @archivedKeys=();
my $counter = 0;
my $pageTitle = "";

# The existing alias file is only needed when new entries are appended to
# it (i.e. not in overwrite mode): it tells us which URLs can be skipped.
if ($overwritedata == 0) {
	# Three-argument open on a lexical handle: the file name comes from the
	# command line and must never be interpreted as an open mode or a pipe.
	if (open(my $aliasIn, '<', $urlAliasFile)) {
		while (my $line = <$aliasIn>) {
			chomp $line; $line =~ s/\r//;
			# The key is the first tab-separated field (the URL)
			my ($key) = split(/\t/, $line);
			next if ! defined $key || $key eq "";
			push @archivedKeys, $key;
		}
		close($aliasIn);
	}
	elsif (-e $urlAliasFile) {
		# A missing file just means "no aliases yet"; an existing but
		# unreadable one would make us append duplicates, so say so.
		warn "Warning: Can't read existing urlalias file $urlAliasFile: $!\n";
	}
	$counter = scalar @archivedKeys;
}
149             
# open input file (might be an AWStats history data file)
#
# Two input formats are supported:
# - an AWStats history file (recognised by its "AWSTATS DATA FILE" header):
#   only the lines between the BEGIN_SIDER and END_SIDER markers are used;
# - any other file: every non-empty line is taken as one URL.
# In both cases the URL is the first whitespace-separated field of the line.
# Reading stops cleanly at end of file, even if END_SIDER is missing.
print "Reading input file: $fileToOpen\n";
open(my $urlListIn, '<', $fileToOpen) or die "Error: Can't open input urllist file $fileToOpen: $!\n";
binmode $urlListIn;

my @field=();
my @addToAliasFile=();
my $addToAliasFileCount=0;
my $isawstatshistoryfile=0;

# Hash lookup of the URLs that already have an alias (empty in overwrite mode)
my %alreadyAliased = map { $_ => 1 } grep { defined $_ } @archivedKeys;
# True while we are between the two markers of a history file
my $insideUrlSection = 0;

while (my $line = <$urlListIn>) {
	chomp $line; $line =~ s/\r//;

	if ($line =~ /^AWSTATS DATA FILE/) {
		print "This file looks like an AWStats history file. Searching URLs list...\n";
		$isawstatshistoryfile=1;
	}

	# Split line out into fields
	@field=split(/\s+/,$line);
	if (! $field[0]) { next; }

	if ($isawstatshistoryfile) {
		if (! $insideUrlSection) {
			# Skip everything up to and including the start marker
			if ($field[0] eq $FILEMARKER1) { $insideUrlSection = 1; }
			next;
		}
		if ($field[0] eq $FILEMARKER2) {
			$insideUrlSection = 0;
			next;
		}
	}

	# When appending, keep only URLs that have no alias yet; when
	# overwriting, everything is 'new'.
	if ($overwritedata == 0 && $alreadyAliased{$field[0]}) { next; }
	push @addToAliasFile, $field[0];
}

close($urlListIn);
$addToAliasFileCount = scalar @addToAliasFile;
213             
# Nothing to do when every URL already has an alias
if ($addToAliasFileCount == 0) {
	print "Found no new documents.\n\n";
	exit();
}

print "Found " . $addToAliasFileCount . " new documents with no alias.\n";

print "Looking thoose pages on web site '$hostname' to get alias...\n";

# Create a user agent (browser) object. It is a file-level variable because
# Generate_Alias_List_Entry uses it for every request.
my $ua = LWP::UserAgent->new;
# set user agent name
$ua->agent($userAgent);
# set user agents owners e-mail address
$ua->from($spiderOwner);
# set timeout for requests
$ua->timeout($getTimeOut);
if ($proxyServer) {
	# set proxy for access to external sites
	$ua->proxy(["http","https"],$proxyServer);
	# avoid proxy for these hosts
	$ua->no_proxy(@hostsNoProxy);
}
# set maximum size of document to retrieve (in bytes)
$ua->max_size($maxDocSizeBytes);
if ($useHTTPS && ! $ua->is_protocol_supported('https')) {
	print "SSL is not supported on this machine.\n\n";
	exit();
}

# Build the lines to write (or append) to the urlalias file. Each page is
# fetched exactly once, with the configured pause before every request.
my $fileOutput = "";
foreach my $newAlias (@addToAliasFile) {
	sleep $throttleRequestsTime;
	$fileOutput .= Generate_Alias_List_Entry($newAlias) . "\n";
}

# Write the data to the urlalias file: truncate it in overwrite mode,
# append to it otherwise.
my $openMode = $overwritedata ? '>' : '>>';
open(my $aliasOut, $openMode, $urlAliasFile)
	or die "Error: Failed to open file for writing: $!\n\n";
print {$aliasOut} $fileOutput;
# Buffered write errors only show up at close time
close($aliasOut)
	or die "Error: Failed to write file $urlAliasFile: $!\n\n";
print "File $urlAliasFile created/updated.\n";

exit();
273             
274 rizwank 1.1 #--------------------------- End of Main -----------------------------
275             
276             
#
# Build one line of the urlalias file for a URL by fetching the page over
# HTTP(S) and extracting its title.
#
# Parameter: the path and document part of the URL (e.g. "/index.html");
#            protocol and $hostname are prepended here.
# Returns:   "<url>\t<title>" without trailing newline. The title is
#            "Unknown Title" when the page can't be fetched or has no title.
# Uses the file-level $ua, $hostname, $useHTTPS and $pageTitle (the latter
# is filled in by title_handler while the document is parsed).
#
sub Generate_Alias_List_Entry {

	# take in the path & document
	my $urltoget = shift;

	my $urlPrefix = $useHTTPS ? "https://" : "http://";

	# build a full HTTP request to pass to user agent
	my $fullurltoget = $urlPrefix . $hostname . $urltoget;

	# forget the title of the previous page
	$pageTitle = "";

	print "Getting page $fullurltoget\n";

	my $req = HTTP::Request->new(GET => $fullurltoget);

	# Pass request to the user agent and get a response back
	my $res = $ua->request($req);

	# Parse returned document for page title
	if ($res->is_success()) {
		my $p = HTML::Parser->new(api_version => 3);
		$p->handler( start => \&title_handler, "tagname,self");
		$p->parse($res->content);
		# Signal end of document so that buffered text is flushed
		$p->eof;
	} else {
		print "Failed to get page: ".$res->status_line."\n";
	}

	# The alias file is tab separated with one entry per line, so a title
	# must not contain tabs or line breaks: collapse all whitespace runs
	# into a single space and trim both ends.
	$pageTitle =~ s/\s+/ /g;
	$pageTitle =~ s/^ //;
	$pageTitle =~ s/ $//;

	if ($pageTitle eq "") {
		$pageTitle = "Unknown Title";
	}
	return $urltoget . "\t" . $pageTitle;
}
323             
# Start-tag handler for HTML::Parser (argspec "tagname,self").
# Ignores every tag but <title>. Once <title> is seen it installs a text
# handler that collects the decoded title text into the file-level
# $pageTitle, and an end handler that stops parsing at </title>.
sub title_handler {
	my ($tagname, $self) = @_;
	return if $tagname ne "title";
	# The parser may deliver the text in several pieces, so append rather
	# than assign. The caller empties $pageTitle before each document.
	$self->handler(text => sub { $pageTitle .= shift }, "dtext");
	$self->handler(end  => sub {
		my ($endTag, $parser) = @_;
		$parser->eof if $endTag eq "title";
	}, "tagname,self");
}

Rizwan Kassim
Powered by
ViewCVS 0.9.2