#!@PERL@
use strict;
use warnings;
use DBI;
use English qw( -no_match_vars );
use Getopt::Long;
use Pod::Usage;

$|=1;

=pod

=head1 NAME

log_db_daemon - Database logging daemon for Squid

=head1 SYNOPSIS

log_db_daemon DSN [options]

=head1 DESCRIPTION

This program writes Squid access.log entries to a database.
Presently only the B<squid> native log format is accepted.

=over 8

=item B<DSN>

Database DSN encoded as a path. This is sent as the access_log file path.

Sample configuration:

  access_log daemon:/host/database/table/username/password squid

To leave a parameter unspecified, use a double slash:

  access_log daemon://database/table/username/password squid

Default: "DBI:mysql:database=squid"

=item B<--debug>

Write debug messages to Squid stderr or cache.log.

=back

=cut

# the first argument to this script is the log file path describing the DSN
my $log_file = shift;

# remaining arguments are options
my $debug = 0;
GetOptions(
    'debug' => \$debug,
);


# utility routine to print messages on stderr (so they appear in the cache log)
# without using warn, which would clutter the log with source line numbers
sub log_info {
    my $msg = shift;
    print STDERR "$msg\n";
}

# we use the logfile path to pass database access information to this script
# sample configuration:
#   access_log daemon:/host/database/table/username/password squid
# to leave a parameter unspecified, e.g. the database host, use a double slash:
#   access_log daemon://database/table/username/password squid
my ( $host, $database, $table, $user, $pass ) = $log_file =~ / \/(.*?)\/(.*?)\/(.*?)\/(.*?)\/(.*?) \z /xms;

if ( !$host ) {
    $host = 'localhost';
    log_info("Database host not specified. Using $host.");
}

if ( !$database ) {
    $database = 'squid_log';
    log_info("Database name not specified. Using $database.");
}

if ( !$table ) {
    $table = 'access_log';
    log_info("Table parameter not specified. Using $table.");
}

if ( !$user ) {
    $user = 'squid';
    log_info("User parameter not specified. Using $user.");
}

if ( !$pass ) {
    log_info('No password specified. Connecting with NO password.');
}

# fields that we should have in the table
# Order here must match the order of fields in the log format and parse() output array.
my @db_fields = qw(
    id
    time_since_epoch
    time_response
    ip_client
    squid_request_status
    http_status_code
    http_reply_size
    http_method
    http_url
    http_username
    squid_hier_status
    ip_server
    http_mime_type
);

# perform db connection
my $dsn = "DBI:mysql:database=$database" . ($host ne "localhost" ? ":$host" : "");
my $dbh;
my $sth;
eval {
    warn "Connecting... dsn='$dsn', username='$user', password='...'";
    $dbh = DBI->connect($dsn, $user, $pass, { AutoCommit => 1, RaiseError => 1, PrintError => 1 });
};
if ($EVAL_ERROR) {
    die "Cannot connect to database: $DBI::errstr";
}


# a simple test to assure the specified table exists
eval {
    my $q = 'SELECT ' . join(',',@db_fields) . " FROM $table LIMIT 1";
    my $sth = $dbh->prepare($q);
    $sth->execute;
};
if ($EVAL_ERROR) {
    # the table is missing (or lacks the expected columns); create it with the required layout
    my $create_query = 'CREATE TABLE ' . $table . ' (' .
        " id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY," .
        " time_since_epoch DECIMAL(15,3)," .
        " time_response INTEGER," .
        " ip_client CHAR(15)," .
        " ip_server CHAR(15)," .
        " http_status_code VARCHAR(10)," .
        " http_reply_size INTEGER," .
        " http_method VARCHAR(20)," .
        " http_url TEXT," .
        " http_username VARCHAR(20)," .
        " http_mime_type VARCHAR(50)," .
        " squid_request_status VARCHAR(50)," .
        " squid_hier_status VARCHAR(20)" .
        ");" ;

    $sth = $dbh->prepare($create_query);
    $sth->execute;
    # test again and fail hard if it is still broken.
    eval {
        my $q = 'SELECT ' . join(',',@db_fields) . " FROM $table LIMIT 1";
        my $sth = $dbh->prepare($q);
        $sth->execute;
    };
    if ($EVAL_ERROR) {
        die "Error initializing database table: $EVAL_ERROR";
    };
}

# for better performance, prepare the statement at startup
eval {
    my $q = "INSERT INTO $table (" . join(',',@db_fields) . ") VALUES(NULL" . ',?' x (scalar(@db_fields)-1) . ')';
    #$sth = $dbh->prepare("INSERT INTO $table VALUES(NULL,?,?,?,?,?,?,?,?,?,?,?,?)");
    $sth = $dbh->prepare($q);
};
if ($EVAL_ERROR) {
    die "Error while preparing sql statement: $EVAL_ERROR";
}

# parse an 'L<line>' command from squid: skip the leading 'L' and split the
# native squid log format into the fields listed in @db_fields (minus 'id').
# The list of captures is the implicit return value.
sub parse($) {
    my ($line) = @_;
    my (@t) = $line =~ /^L(\d+\.\d+) *(\d+?) (.*?) (.*?)\/(\d+?) (\d+?) (.*?) (.*?) (.*?) (.*?)\/(.*?) (.*)$/;
}

# main loop: read commands from squid on stdin, one per line
while (my $line = <>) {
    chomp $line;

    my $cmd = substr($line, 0, 1); # extract command byte

    if ( $cmd eq 'L' ) {
        # 'L' means "append this line to the log"; other commands are currently ignored
        my @log_entry = parse($line);
        eval {                  # we catch db errors to avoid crashing squid in case something goes wrong...
            $sth->execute(@log_entry) or die $sth->errstr
        };
        if ( $EVAL_ERROR ) {    # leave a trace of the error in the logs
            warn $EVAL_ERROR . " values=(" . join(', ', @log_entry) . ')';
        }
    }
}

$dbh->disconnect();

__END__

=head1 DESCRIPTION

This module exploits the logfile daemon support available in squid 2.7 and 3.2 to store access log entries in a MySQL database.

=head1 CONFIGURATION

=head2 Squid configuration

=head3 access_log directive

The path to the access log file is used to provide the database connection parameters.

  access_log daemon:/mysql_host:port/database/table/username/password squid

The 'daemon' prefix is mandatory and tells squid that the B<logfile_daemon> helper is to be used instead of the normal file logging.

The last parameter tells squid which log format to use when writing lines to the log daemon.
Presently only the B<squid> native format is supported.

=over 4

=item mysql_host:port

Host where the mysql server is running. If left empty, 'localhost' is assumed.

=item database

Name of the database to connect to. If left empty, 'squid_log' is assumed.

=item table

Name of the database table where log lines are stored. If left empty, 'access_log' is assumed.

=item username

Username to use when connecting to the database. If left empty, 'squid' is assumed.

=item password

Password to use when connecting to the database. If left empty, no password is used.

=back

To leave all fields at their default values, use a single slash:

  access_log daemon:/ squid

To specify only the database password, which is empty by default, leave all the other parameters unspecified by using null strings:

  access_log daemon://///password squid

=head3 logfile_daemon directive

This directive tells squid where the logfile daemon helper resides.

  logfile_daemon /path/to/squid/libexec/logfile-daemon_mysql.pl

The script must be copied to the location specified in the directive.

=head2 Database configuration

Let's call the database 'squid_log' and the log table 'access_log'. The username and password for the db connection will both be 'squid'.

=head3 Database

Create the database:

  CREATE DATABASE squid_log;

=head3 User

Create the user:

  GRANT INSERT,SELECT,CREATE ON squid_log.* TO 'squid'@'localhost' IDENTIFIED BY 'squid';
  FLUSH PRIVILEGES;

Note that only CREATE, INSERT and SELECT privileges are granted to the 'squid' user. This ensures that the logfile daemon script cannot modify or delete existing log entries.

=head3 Table

The daemon will attempt to initialize this table if none exists when it starts.

The table created should look like:

  CREATE TABLE access_log (
      id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
      time_since_epoch DECIMAL(15,3),
      time_response INTEGER,
      ip_client CHAR(15),
      ip_server CHAR(15),
      http_status_code VARCHAR(10),
      http_reply_size INTEGER,
      http_method VARCHAR(20),
      http_url TEXT,
      http_username VARCHAR(20),
      http_mime_type VARCHAR(50),
      squid_request_status VARCHAR(50),
      squid_hier_status VARCHAR(20)
  );

=head1 VERSION INFORMATION

This document refers to C<log_db_daemon> script version 0.5.

The script has been developed and tested in the following environment:

=over 4

=item squid 2.7 and squid 3.2

=item mysql 5.0.26 and 5.1

=item perl 5.8.8

=item OpenSUSE 10.2

=back

=head1 DATA EXTRACTION

=head2 Sample queries

=over 4

=item Clients accessing the cache

  SELECT DISTINCT ip_client FROM access_log;

=item Number of requests per day

  SELECT
    DATE(FROM_UNIXTIME(time_since_epoch)) AS date_day,
    COUNT(*) AS num_of_requests
  FROM access_log
  GROUP BY 1
  ORDER BY 1;

=item Request status count

To obtain the raw count of each request status:

  SELECT squid_request_status, COUNT(*) AS n
  FROM access_log
  GROUP BY squid_request_status
  ORDER BY 2 DESC;

To calculate the percentage of each request status:

  SELECT
    squid_request_status,
    (COUNT(*)/(SELECT COUNT(*) FROM access_log)*100) AS percentage
  FROM access_log
  GROUP BY squid_request_status
  ORDER BY 2 DESC;

To distinguish only between HITs and MISSes:

  SELECT
    'hits',
    (SELECT COUNT(*)
    FROM access_log
    WHERE squid_request_status LIKE '%HIT%')
    /
    (SELECT COUNT(*) FROM access_log)*100
    AS percentage
  UNION
  SELECT
    'misses',
    (SELECT COUNT(*)
    FROM access_log
    WHERE squid_request_status LIKE '%MISS%')
    /
    (SELECT COUNT(*) FROM access_log)*100
    AS percentage;

=item Response time ranges

  SELECT
    '0..500',
    COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage
  FROM access_log
  WHERE time_response >= 0 AND time_response < 500
  UNION
  SELECT
    '500..1000',
    COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage
  FROM access_log
  WHERE time_response >= 500 AND time_response < 1000
  UNION
  SELECT
    '1000..2000',
    COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage
  FROM access_log
  WHERE time_response >= 1000 AND time_response < 2000
  UNION
  SELECT
    '>= 2000',
    COUNT(*)/(SELECT COUNT(*) FROM access_log)*100 AS percentage
  FROM access_log
  WHERE time_response >= 2000;

=item Traffic by mime type

  SELECT
    http_mime_type,
    SUM(http_reply_size) AS total_bytes
  FROM access_log
  GROUP BY http_mime_type
  ORDER BY 2 DESC;

=item Traffic by client

  SELECT
    ip_client,
    SUM(http_reply_size) AS total_bytes
  FROM access_log
  GROUP BY 1
  ORDER BY 2 DESC;

=back

=head2 Speed issues

The MyISAM storage engine is known to be faster than InnoDB, so although it does not support transactions or referential integrity, it may be more appropriate in this scenario. You might want to append "ENGINE=MYISAM" to the end of the table creation statement in the SQL above.

Indexes should be created according to the queries that are run most frequently. The DDL above only creates the implicit index for the primary key column.
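
For example, assuming the per-day and per-client reports shown above are run regularly, indexes along the following lines (the index names are only illustrative) could speed them up:

  CREATE INDEX idx_access_log_time   ON access_log (time_since_epoch);
  CREATE INDEX idx_access_log_client ON access_log (ip_client);

Keep in mind that each additional index slows down inserts, which are the operation this daemon performs most often.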

=head1 TODO

=head2 Table cleanup

This script currently implements only the C<L> (i.e. "append a line to the log") command, therefore log lines are never purged from the table. This approach has an obvious scalability problem.

One solution would be to implement e.g. the "rotate log" command in a way that would calculate some summary values, put them in a "summary table" and then delete the lines used to calculate those values.

Similar cleanup code could be implemented in an external script and run periodically, independently of squid log commands; a sketch of the idea is given below.
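
The following is only a rough sketch of that idea, not part of the script: it assumes a hypothetical summary table named access_log_summary and folds everything older than the current day into per-day totals before deleting the detail rows. Note that the 'squid' user created above has no DELETE privilege, so a cleanup job like this would have to run as a separate, more privileged database user.

  -- hypothetical summary table; name and columns are illustrative
  CREATE TABLE access_log_summary (
      date_day DATE NOT NULL PRIMARY KEY,
      num_of_requests INTEGER,
      total_bytes BIGINT
  );

  -- fold completed days into the summary table...
  INSERT INTO access_log_summary (date_day, num_of_requests, total_bytes)
  SELECT DATE(FROM_UNIXTIME(time_since_epoch)), COUNT(*), SUM(http_reply_size)
  FROM access_log
  WHERE time_since_epoch < UNIX_TIMESTAMP(CURDATE())
  GROUP BY 1;

  -- ...then drop the detail rows that were summarised
  DELETE FROM access_log WHERE time_since_epoch < UNIX_TIMESTAMP(CURDATE());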

=head2 Testing

This script has only been tested in low-volume scenarios (a single client, fewer than 10 req/s). Tests in high-volume environments could reveal performance bottlenecks and bugs.

=head1 AUTHOR

Marcello Romani, marcello.romani@libero.it
Amos Jeffries, amosjeffries@squid-cache.org

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2008 by Marcello Romani

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.

=cut