1 rizwank 1.1 #!/usr/bin/perl
2 #-------------------------------------------------------
3 # Small script to auto-generate URL Alias files for 5.2+ AWStats
4 # Requires two Perl modules below.
5 # From original title-grabber.pl file
6 # (Feedback/suggestions to: simonjw@users.sourceforge.net)
7 # Modified by eldy@users.sourceforge.net
8 #
9 # Note: If you want to retrieve document titles over SSL you must have OpenSSL and
10 # the Net::SSL(eay) Perl Module available. This code will check that SSL is
11 # supported before attempting to retrieve via it.
12 #-------------------------------------------------------
13 use LWP::UserAgent;
14 use HTML::TokeParser;
15
16 use strict;no strict "refs";
17
18
# Script identification.
# The revision number is extracted from the CVS keyword string (the text
# between the first and the last whitespace character).
my ($REVISION) = '$Revision: 1.6 $' =~ /\s(.*)\s/;
my $VERSION = "1.0 (build $REVISION)";

############### EDIT HERE ###############

# Site config name. Set it here if the script is always run for one site;
# it is then used as the host name and in the default file names.
my $SITECONFIG = "";

# Directory where the default input file is looked for.
my $awStatsDataDir = "/var/lib/awstats";

# Pause (in seconds) made before each HTTP request, to avoid DoS-like
# behaviour on a quick network. Zero means no throttling.
my $throttleRequestsTime = 0;

# ---- LWP settings ----

# User agent string sent to the server. Add it to SkipUserAgents in
# awstats.conf if hits made by this script must be ignored.
my $userAgent = "urlaliasbuilder/$VERSION";

# E-mail address sent with each request; put a sensible one here.
my $spiderOwner = "spider\@mydomain.com";

# Timeout (in seconds) of each HTTP request (increase on slow connections).
my $getTimeOut = 2;

# Proxy server used for http/https requests - leave empty for no proxy.
#my $proxyServer = "http://my.proxy.server:port/";
my $proxyServer = "";

# Hosts that must be reached without the proxy.
my @hostsNoProxy = qw(host1 host1.my.domain.name);

# Maximum number of bytes downloaded per document. Only the head section
# is needed, so multi-megabyte files are never fetched completely.
my $maxDocSizeBytes = 4096;

############### DON'T EDIT BELOW HERE ###############

# Don't edit these: first field of the lines that open and close the URL
# section of the input file.
my $FILEMARKER1 = "BEGIN_SIDER";
my $FILEMARKER2 = "END_SIDER";

# Current local date; month and year are used to build the default input
# file name.
my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime();

my ($fullMonth, $fullYear) = (
    sprintf("%02d", $mon + 1),
    sprintf("%04d", $year + 1900),
);
63
64 rizwank 1.1
65 # ====== main ======
66
# Option values; the defaults below are replaced by what is found on the
# command line.
my $helpfound     = 0;
my $nohosts       = 0;
my $overwritedata = 0;
my $hostname      = "";
my $useHTTPS      = 0;

# When a site config is set, its name is inserted in the default file names
my $siteSuffix = "";
if ($SITECONFIG) { $siteSuffix = ".$SITECONFIG"; }

# Data file to open
my $fileToOpen = "${awStatsDataDir}/awstats${fullMonth}${fullYear}${siteSuffix}.txt";
# URL Alias file to open
my $urlAliasFile = "urlalias${siteSuffix}.txt";

# Look at each argument in turn; the first pattern that matches wins.
# Leading dashes are optional and option names are case-insensitive.
foreach my $arg (@ARGV) {
    if    ($arg =~ /^-*urllistfile=([^\s&]+)/i)  { $fileToOpen    = "$1"; }
    elsif ($arg =~ /^-*urlaliasfile=([^\s&]+)/i) { $urlAliasFile  = "$1"; }
    elsif ($arg =~ /^-*site=(.*)/i)              { $hostname      = "$1"; }
    elsif ($arg =~ /^-*h/i)                      { $helpfound     = 1; }
    elsif ($arg =~ /^-*overwrite/i)              { $overwritedata = 1; }
    elsif ($arg =~ /^-*secure/i)                 { $useHTTPS      = 1; }
}

# No -site= option: fall back to the configured site if there is one,
# otherwise remember that no host is known so that the usage is shown.
if (! $hostname) {
    if ($SITECONFIG) { $hostname = $SITECONFIG; }
    else             { $nohosts  = 1; }
}
# Show usage help
#
# $0 is split into the directory ($DIR), the program name without its
# extension ($PROG) and the extension ($Extension). The usage text is
# printed, and the script exits, when no host is known, when help was
# requested or when no argument at all was given.
my $DIR; my $PROG; my $Extension;
($DIR = $0) =~ s/([^\/\\]*)$//;
# File name exactly as invoked (with its extension, if any)
my $scriptName = $1;
$PROG = $scriptName;
# Read $1 only when the substitution really matched: after a failed match
# $1 still holds the capture of the previous match (the whole file name),
# which made a script without extension show up as "name.name".
$Extension = ($PROG =~ s/\.([^\.]*)$//) ? $1 : "";
if ($nohosts || $helpfound || ! @ARGV) {
    print "\n----- $PROG $VERSION -----\n";
    print ucfirst($PROG)." generates an 'urlalias' file from an input file.\n";
    print "The input file must contain a list of URLs (It can be an AWStats history file).\n";
    print "For each of thoose URLs, the script get the corresponding HTML page and catch the\n";
    print "header information (title), then it writes an output file that contains one line\n";
    print "for each URLs and several fields:\n";
    print "- The first field is the URL,\n";
    print "- The second is title caught from web page.\n";
    print "This resulting file can be used by AWStats urlalias plugin.\n";
    print "\n";
    print "Usage: $scriptName -site=www.myserver.com [options]\n";
    print "\n";
    print "The site parameter contains the web server to get the page from.\n";
    print "Where options are:\n";
    print " -urllistfile=Input urllist file\n";
    print " If this file is an AWStats history file then urlaliasbuilder will use the\n";
    print " SIDER section of this file as its input URL's list.\n";
    print " -urlaliasfile=Output urlalias file to build\n";
    print " -overwrite Overwrite output file if exists\n";
    print " -secure Use https protocol\n";
    print "\n";
    print "Example: $scriptName -site=www.someotherhost.com\n";
    print "\n";
    print "This is default configuration used when no option are provided on command line:\n";
    print "Input urllist file: $fileToOpen (overwritten by -urllistfile option)\n";
    print "Output urlalias file: $urlAliasFile (overwritten by -urlaliasfile option)\n";
    print "\n";
    print "This script was written from Simon Waight original works title-grabber.pl.\n";
    print "\n";
    exit 0;
}
129
# URLs (first tab-separated field of each line) already present in the
# urlalias file, and how many of them were stored.
my @archivedKeys=();
my $counter = 0;
# Title of the page being processed; it is filled in by the HTML parser
# handlers and read back by Generate_Alias_List_Entry.
my $pageTitle = "";

# only read the alias file if we want to do a comparison
# and append new items only (i.e. not overwrite)
# A missing alias file is not an error: it simply means nothing is known yet.
if ($overwritedata == 0 && -e $urlAliasFile) {
    if (open(my $aliasFh, '<', $urlAliasFile)) {
        while (my $line = <$aliasFh>) {
            chomp $line; $line =~ s/\r//;
            my ($knownUrl) = split(/\t/, $line);
            # Empty lines carry no URL and can never match an input entry
            next if ! defined $knownUrl || $knownUrl eq "";
            $archivedKeys[$counter] = $knownUrl;
            $counter++;
            #print "key: " . $knownUrl . "\n";
        }
        close($aliasFh);
    }
    else {
        # Keep going (every URL is then considered new), but say why
        warn "Warning: Can't read existing urlalias file $urlAliasFile: $!\n";
    }
}
149
# open input file (might be an AWStats history data file)
#
# Collects in @addToAliasFile the URLs (first whitespace-separated field of
# a line) that need an alias:
# - in an AWStats history file (recognised by its "AWSTATS DATA FILE"
#   header line) only the lines found between the BEGIN_SIDER and END_SIDER
#   marker lines are used,
# - in any other file every line is used, marker lines excepted.
# Unless -overwrite was given, URLs already listed in @archivedKeys are
# left out.
print "Reading input file: $fileToOpen\n";
open(my $urlListFh, '<', $fileToOpen) || die "Error: Can't open input urllist file $fileToOpen";
binmode $urlListFh;

my @field=();
my @addToAliasFile=();
my $addToAliasFileCount=0;
my $isawstatshistoryfile=0;

# Known URLs as a hash, so each input line needs one lookup instead of a
# scan of the whole alias list
my %hasAlias = map { $_ => 1 } grep { defined } @archivedKeys;
# True while reading lines located between the two section markers
my $inUrlSection = 0;

# Reading line by line (instead of looping until the end marker shows up)
# guarantees that the loop stops at end of file, even for a plain list or
# a truncated history file, and that the first line of a plain list is
# not lost.
while (my $line = <$urlListFh>) {
    chomp $line; $line =~ s/\r//;

    if ($line =~ /^AWSTATS DATA FILE/) {
        print "This file looks like an AWStats history file. Searching URLs list...\n";
        $isawstatshistoryfile=1;
    }

    # Split line out into fields
    @field=split(/\s+/,$line);
    if (! $field[0]) { next; }

    # Marker lines only delimit the URL section, they are never URLs
    if ($field[0] eq $FILEMARKER1) { $inUrlSection = 1; next; }
    if ($field[0] eq $FILEMARKER2) { $inUrlSection = 0; next; }

    # In a history file, everything outside the URL section is other data
    if ($isawstatshistoryfile && ! $inUrlSection) { next; }

    # compare input entry against urlalias entries
    # only if we don't just want to write current items
    # to the file (i.e. overwrite)
    if ($overwritedata == 0 && $hasAlias{$field[0]}) { next; }

    # it's a new URL, so add to list of items to retrieve
    push @addToAliasFile, $field[0];
    $addToAliasFileCount++;
    #print "new: " . $field[0] . "\n"
}

close($urlListFh);
213
# Stop here when the input brought no URL to look up.
unless ($addToAliasFileCount) {
    print "Found no new documents.\n\n";
    exit();
}

print "Found $addToAliasFileCount new documents with no alias.\n";
220
# Text to write to the urlalias file (declared once; a second "my" in the
# same scope would only mask this variable)
my $fileOutput = "";

print "Looking thoose pages on web site '$hostname' to get alias...\n";

# Create a user agent (browser) object
my $ua = LWP::UserAgent->new;
# set user agent name
$ua->agent($userAgent);
# set user agents owners e-mail address
$ua->from($spiderOwner);
# set timeout for requests
$ua->timeout($getTimeOut);
if ($proxyServer) {
    # set proxy for access to external sites
    $ua->proxy(["http","https"],$proxyServer);
    # avoid proxy for these hosts
    $ua->no_proxy(@hostsNoProxy);
}
# set maximum size of document to retrieve (in bytes)
$ua->max_size($maxDocSizeBytes);
# https was asked for: give up now if this LWP installation can't do it
if ($useHTTPS && ! $ua->is_protocol_supported('https')) {
    print "SSL is not supported on this machine.\n\n";
    exit();
}
247
# Now lets build the contents to write (or append) to urlalias file.
# Each URL is requested exactly once, whatever the write mode, and the
# throttling pause is made before every request.
foreach my $newAlias (@addToAliasFile) {
    sleep $throttleRequestsTime;
    $fileOutput .= Generate_Alias_List_Entry($newAlias) . "\n";
}

# write the data back to urlalias file: append the new entries by default,
# replace the whole file when -overwrite was given
my $writeMode = $overwritedata ? '>' : '>>';
open(my $aliasOutFh, $writeMode, $urlAliasFile)
    || die "Error: Failed to open file $urlAliasFile for writing: $!\n\n";
print {$aliasOutFh} $fileOutput;
# Buffered write errors (disk full...) only show up when closing
close($aliasOutFh)
    || die "Error: Failed to write file $urlAliasFile: $!\n\n";
print "File $urlAliasFile created/updated.\n";

exit();
273
274 rizwank 1.1 #--------------------------- End of Main -----------------------------
275
276
#
# Generate a new line for urlalias file by doing a http get using data
# supplied.
#
# Parameter: the path & document part of the URL (appended as is to the
#            protocol prefix and $hostname to build the URL requested).
# Returns:   "<path>\t<title>", without end of line. The title is
#            "Unknown Title" when the request fails or when no title
#            text is found in the page.
# Uses the file-level $ua, $hostname and $useHTTPS, and $pageTitle which is
# the variable the parser handlers store the title in.
#
sub Generate_Alias_List_Entry {

    # Both classes are normally loaded by the modules used at the top of
    # the file; requiring them here removes the dependency on that.
    require HTTP::Request;
    require HTML::Parser;

    # take in the path & document
    my $urltoget = shift;

    my $urlPrefix = $useHTTPS ? "https://" : "http://";

    # build a full HTTP request to pass to user agent
    my $fullurltoget = $urlPrefix . $hostname . $urltoget;

    # Forget the title of the previous page
    $pageTitle = "";

    # Create a HTTP request
    print "Getting page $fullurltoget\n";

    my $req = HTTP::Request->new(GET => $fullurltoget);

    # Pass request to the user agent and get a response back
    my $res = $ua->request($req);

    # Parse returned document for page title
    if ($res->is_success()) {
        my $p = HTML::Parser->new(api_version => 3);
        # Ask for each block of text in one piece; otherwise the parser is
        # free to split the title over several text events and only the
        # last piece would be kept by the text handler.
        $p->unbroken_text(1);
        $p->handler( start => \&title_handler, "tagname,self");
        $p->parse($res->content);
    } else {
        print "Failed to get page: ".$res->status_line."\n";
    }

    # The urlalias file holds one entry per line, fields separated by a
    # tab: a title spread over several lines or containing tabs would
    # corrupt it. Squeeze every run of whitespace into one space and trim.
    $pageTitle = "" if ! defined $pageTitle;
    $pageTitle =~ s/\s+/ /g;
    $pageTitle =~ s/^ //;
    $pageTitle =~ s/ $//;

    if ($pageTitle eq "") {
        $pageTitle = "Unknown Title";
    }
    return $urltoget . "\t" . $pageTitle;
}
323
# Start tag handler for HTML::Parser (registered with "tagname,self").
# Does nothing until a "title" start tag is seen. From then on the text
# reported by the parser is stored in $pageTitle, and parsing is ended as
# soon as the "title" end tag is met.
sub title_handler {
    my ($tagname, $parser) = @_;
    if ($tagname ne "title") { return; }

    # Keep the (entity decoded) text reported by the parser
    $parser->handler(text => sub { $pageTitle = shift }, "dtext");

    # Nothing else is needed once the title is closed: stop the parser
    $parser->handler(
        end => sub {
            my ($closedTag, $activeParser) = @_;
            $activeParser->eof if $closedTag eq "title";
        },
        "tagname,self"
    );
}
|