]> git.ipfire.org Git - thirdparty/sarg.git/blob - readlog_extlog.c
Update the man page
[thirdparty/sarg.git] / readlog_extlog.c
1 /*
2 * SARG Squid Analysis Report Generator http://sarg.sourceforge.net
3 * 1998, 2015
4 *
5 * SARG donations:
6 * please look at http://sarg.sourceforge.net/donations.php
7 * Support:
8 * http://sourceforge.net/projects/sarg/forums/forum/363374
9 * ---------------------------------------------------------------------
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
24 *
25 */
26
27 #include "include/conf.h"
28 #include "include/defs.h"
29 #include "include/readlog.h"
30
/*!
Upper bound on the number of columns a "fields" directive may announce.

The value itself has no special meaning. It only exists to give the
per-column tables a fixed size.
*/
#define MAX_EXT_COLUMNS 250

/*!
Identifiers of the columns extracted from an extended log.

Each identifier is also used as an index in the table of the column
positions, hence \c EXTCOL_Last must remain the last entry.
*/
enum ext_col_id {
	EXTCOL_Ip=0,       //!< "c-ip" or "c-dns" field.
	EXTCOL_UserName,   //!< "cs-username" field.
	EXTCOL_Date,       //!< "date" field.
	EXTCOL_Time,       //!< "time" field.
	EXTCOL_TimeTaken,  //!< "time-taken" field.
	EXTCOL_Bytes,      //!< "sc-bytes" field.
	EXTCOL_Uri,        //!< "cs-uri" field.
	EXTCOL_Scheme,     //!< "cs-uri-scheme" field.
	EXTCOL_Host,       //!< "cs-host" field.
	EXTCOL_Port,       //!< "cs-uri-port" field.
	EXTCOL_Path,       //!< "cs-uri-path" field.
	EXTCOL_Query,      //!< "cs-uri-query" field.
	EXTCOL_Status,     //!< "sc-status" field.
	EXTCOL_UserAgent,  //!< "cs(User-Agent)" field.
	EXTCOL_Last        //!< Number of identifiers. Last entry of the list!
};
56
//! \c True once a directive of the extended log format has been recognised in the current file.
static bool InExtLog=false;
//! The position of each relevant column in a log line, indexed by ::ext_col_id. It is -1 if the column is not in the log.
static int ExtCols[EXTCOL_Last];
//! The separator character found after each column of the "fields" directive, indexed by column position.
static char ExtColSep[MAX_EXT_COLUMNS];
//! The number of columns according to the "fields" directive.
static int ExtColNumber;
//! Temporary buffer to rebuild the URL from its split components.
static char ExtTempUrl[MAX_URL_LEN];
67
68 /*!
69 A new file is being read. The name of the file is \a FileName.
70 */
71 static void ExtLog_NewFile(const char *FileName)
72 {
73 InExtLog=false;
74 ExtColNumber=0;
75 }
76
77 /*!
78 Parse the "Fields" directive listing the columns in the log. The
79 \a columns is a pointer to the first column of the directive.
80
81 \return \c True if the fields is valid or false if it could not
82 be decoded.
83 */
84 static bool ExtLog_Fields(const char *columns)
85 {
86 int col;
87 int len;
88 int prefix;
89 int header_start;
90 int header_end;
91 int i;
92 enum ext_col_id col_id;
93 char col_sep;
94 // see http://www.w3.org/TR/WD-logfile.html for the list of prefixes
95 const char const *prefixes[]=
96 {
97 "c",
98 "s",
99 "r",
100 "cs",
101 "sc",
102 "sr",
103 "rs",
104 "x",
105 };
106
107 for (i=0 ; i<EXTCOL_Last ; i++) ExtCols[i]=-1;
108
109 col=0;
110 while (*columns) {
111 if (col>=MAX_EXT_COLUMNS) {
112 debuga(__FILE__,__LINE__,_("Too many columns found in an extended log format. The maximum allowed is %d but it can be changed if a bigger value is legitimate\n"),MAX_EXT_COLUMNS);
113 exit(EXIT_FAILURE);
114 }
115 prefix=-1;
116 header_start=-1;
117 header_end=-1;
118 for (i=sizeof(prefixes)/sizeof(*prefixes)-1 ; i>=0 ; i--) {
119 len=strlen(prefixes[i]);
120 if (strncasecmp(columns,prefixes[i],len)==0) {
121 if (columns[len]=='-') {
122 prefix=len++;
123 break;
124 } else if (columns[len]=='(') {
125 header_start=len++;
126 break;
127 }
128 }
129 }
130 (void)prefix;//compiler pacifier
131 if (i<0) len=0;
132 for ( ; (unsigned char)columns[len]>' ' ; len++) {//skip a word and accept any separator (tab or space)
133 if (header_start>=0 && columns[len]==')') header_end=len;
134 }
135 (void)header_end;//compiler pacifier
136 col_sep=columns[len];
137 ExtColSep[col]=col_sep;
138
139 // see http://www.w3.org/TR/WD-logfile.html for list of possible identifiers
140 col_id=EXTCOL_Last;
141 if (len==4) {
142 if (strncasecmp(columns,"c-ip",len)==0 && ExtCols[EXTCOL_Ip]<0) col_id=EXTCOL_Ip;
143 else if (strncasecmp(columns,"date",len)==0) col_id=EXTCOL_Date;
144 else if (strncasecmp(columns,"time",len)==0) col_id=EXTCOL_Time;
145 } else if (len==5) {
146 if (strncasecmp(columns,"c-dns",len)==0) col_id=EXTCOL_Ip;
147 } else if (len==6) {
148 if (strncasecmp(columns,"cs-uri",len)==0) col_id=EXTCOL_Uri;
149 } else if (len==7) {
150 if (strncasecmp(columns,"cs-host",len)==0) col_id=EXTCOL_Host;
151 } else if (len==8) {
152 if (strncasecmp(columns,"sc-bytes",len)==0) col_id=EXTCOL_Bytes;
153 } else if (len==9) {
154 if (strncasecmp(columns,"sc-status",len)==0) col_id=EXTCOL_Status;
155 } else if (len==10) {
156 if (strncasecmp(columns,"time-taken",len)==0) col_id=EXTCOL_TimeTaken;
157 } else if (len==11) {
158 if (strncasecmp(columns,"cs-username",len)==0) col_id=EXTCOL_UserName;
159 if (strncasecmp(columns,"cs-uri-port",len)==0) col_id=EXTCOL_Port;
160 if (strncasecmp(columns,"cs-uri-path",len)==0) col_id=EXTCOL_Path;
161 } else if (len==12) {
162 if (strncasecmp(columns,"cs-uri-query",len)==0) col_id=EXTCOL_Query;
163 } else if (len==13) {
164 if (strncasecmp(columns,"cs-uri-scheme",len)==0) col_id=EXTCOL_Scheme;
165 } else if (len==14) {
166 if (strncasecmp(columns,"cs(User-Agent)",len)==0) col_id=EXTCOL_UserAgent;
167 }
168 if (col_id!=EXTCOL_Last) {
169 ExtCols[col_id]=col;
170 }
171
172 col++;
173 columns+=len;
174 while (*columns && (unsigned char)*columns<=' ') {
175 if (*columns!=col_sep) {
176 debuga(__FILE__,__LINE__,_("Multiple column separators found between two columns in the \"fields\" directive of an extended log format\n"));
177 exit(EXIT_FAILURE);
178 }
179 columns++;
180 }
181 }
182 ExtColNumber=col;
183 return(true);
184 }
185
186 /*!
187 Decode a directive field from the \a Line.
188
189 \return RLRC_Ignore if the line is a directive or RLRC_Unknown
190 if the line is not a known directive.
191 */
192 static enum ReadLogReturnCodeEnum ExtLog_Directive(const char *Line)
193 {
194 ++Line;
195 if (strncasecmp(Line,"Version:",8)==0) return(RLRC_Ignore);
196 if (strncasecmp(Line,"Software:",9)==0) return(RLRC_Ignore);
197 if (strncasecmp(Line,"Start-Date:",11)==0) return(RLRC_Ignore);
198 if (strncasecmp(Line,"End-Date:",9)==0) return(RLRC_Ignore);
199 if (strncasecmp(Line,"Date:",5)==0) return(RLRC_Ignore);
200 if (strncasecmp(Line,"Remark:",7)==0) return(RLRC_Ignore);
201 if (strncasecmp(Line,"Fields:",7)==0) {
202 Line+=7;
203 while (*Line==' ' || *Line=='\t') Line++;
204 if (!ExtLog_Fields(Line)) return(RLRC_Unknown);
205 return(RLRC_Ignore);
206 }
207 return(RLRC_Unknown);
208 }
209
210 /*!
211 Get the type of the column \a col_num.
212
213 \return The type of the column or EXTCOL_Last if
214 the column must be ignored.
215 */
216 static enum ext_col_id ExtLog_WhichColumn(int col_num)
217 {
218 int i;
219
220 for (i=0 ; i<EXTCOL_Last && ExtCols[i]!=col_num ; i++);
221 return(i);
222 }
223
224 /*!
225 Scan through the string of a column.
226
227 \param Line The pointer to the beginning of the string.
228 \param col The column number.
229 */
230 static char *ExtLog_GetString(char *Line,int col,char **End)
231 {
232 bool quote;
233
234 //skip opening double quote
235 quote=(*Line=='\"');
236 if (quote) ++Line;
237
238 while (*Line) {
239 if (quote) {
240 if (*Line=='\"') {
241 if (Line[1]=='\"') {
242 Line++;//skip the first quote here, the second is skipped by the other Line++
243 } else {
244 if (End) *End=Line;
245 Line++;//skip closing quote
246 quote=false;
247 break;
248 }
249 }
250 } else {
251 if (*Line==ExtColSep[col]) {
252 if (End) *End=Line;
253 break;
254 }
255 }
256 Line++;
257 }
258 if (quote) return(NULL);//missing closing quote.
259 return(Line);
260 }
261
/*!
Scan through the date in a column.

The date is expected as year-month-day and may be enclosed in double quotes.

\param Line The pointer to the beginning of the string.
\param Date The structure whose year, month and day are set from the
column. It is left untouched if the date is invalid.

\return The pointer to the first character after the date or NULL if
the date could not be decoded.
*/
static char *ExtLog_GetDate(char *Line,struct tm *Date)
{
	int y;
	int m;
	int d;
	int consumed;
	const bool quoted=(Line[0]=='\"');
	char *cursor=(quoted) ? Line+1 : Line;//skip opening double quote

	if (sscanf(cursor,"%d-%d-%d%n",&y,&m,&d,&consumed)!=3) return(NULL);
	cursor+=consumed;
	if (quoted) {
		if (*cursor!='\"') return(NULL);//missing closing quote.
		cursor++;
	}

	// struct tm counts the years from 1900 and the months from zero
	Date->tm_mday=d;
	Date->tm_mon=m-1;
	Date->tm_year=y-1900;
	return(cursor);
}
289
/*!
Scan through the time in a column.

The time is expected as hour:minute:second and may be enclosed in double quotes.

\param Line The pointer to the beginning of the string.
\param Date The structure whose hour, minute and second are set from
the column. It is left untouched if the time is invalid.

\return The pointer to the first character after the time or NULL if
the time could not be decoded.
*/
static char *ExtLog_GetTime(char *Line,struct tm *Date)
{
	int h;
	int m;
	int s;
	int consumed;
	const bool quoted=(Line[0]=='\"');
	char *cursor=(quoted) ? Line+1 : Line;//skip opening double quote

	if (sscanf(cursor,"%d:%d:%d%n",&h,&m,&s,&consumed)!=3) return(NULL);
	cursor+=consumed;
	if (quoted) {
		if (*cursor!='\"') return(NULL);//missing closing quote.
		cursor++;
	}

	Date->tm_sec=s;
	Date->tm_min=m;
	Date->tm_hour=h;
	return(cursor);
}
317
/*!
Scan through a number in a column.

The number is an unsigned sequence of decimal digits that may be
enclosed in double quotes. A lone dash, which marks an omitted field in
the extended log format, is read as zero.

\param Line The pointer to the beginning of the string.
\param Value A variable to store the number. It is zero if the column
contains no digits.

\return The pointer to the first character after the number or NULL if
the closing quote is missing.
*/
static char *ExtLog_GetLongInt(char *Line,long int *Value)
{
	bool quote;

	//skip opening double quote
	quote=(*Line=='\"');
	if (quote) ++Line;
	*Value=0;
	if (*Line=='-' && !isdigit((unsigned char)Line[1])) {
		++Line;//omitted field
	} else {
		//the cast is required as isdigit is undefined for negative values of a plain char
		while (isdigit((unsigned char)*Line)) *Value=*Value*10+(*Line++-'0');
	}
	if (quote) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		++Line;
	}
	return(Line);
}
339
/*!
Scan through a big number in a column.

The number is an unsigned sequence of decimal digits that may be
enclosed in double quotes. A lone dash, which marks an omitted field in
the extended log format, is read as zero.

\param Line The pointer to the beginning of the string.
\param Value A variable to store the number. It is zero if the column
contains no digits.

\return The pointer to the first character after the number or NULL if
the closing quote is missing.
*/
static char *ExtLog_GetLongLongInt(char *Line,long long int *Value)
{
	bool quote;

	//skip opening double quote
	quote=(*Line=='\"');
	if (quote) ++Line;
	*Value=0;
	if (*Line=='-' && !isdigit((unsigned char)Line[1])) {
		++Line;//omitted field
	} else {
		//the cast is required as isdigit is undefined for negative values of a plain char
		while (isdigit((unsigned char)*Line)) *Value=*Value*10+(*Line++-'0');
	}
	if (quote) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		++Line;
	}
	return(Line);
}
361
/*!
Terminate the \a string of a column and remove the quotes it may contain.

If the string doesn't begin with a double quote, it is merely
terminated at \a end_ptr. Otherwise, the enclosing quotes are removed,
every doubled double quote is replaced by a single one and the string
is terminated in place of the closing quote.

\param string The first character of the column or NULL if the column
was not parsed, in which case nothing is done.
\param end_ptr The pointer to the character following the last
character of an unquoted string (the column separator or the end of the
line). It is not used if the string is quoted. An unquoted string is
left unchanged if it is NULL.
*/
static void ExtLog_FixString(char *string,char *end_ptr)
{
	char *dest;

	if (!string) return;//string not parsed
	if (*string!='\"') { //no quotes to remove from the string
		if (end_ptr) *end_ptr='\0';
		return;
	}

	// Remove the opening quote. It must be skipped unconditionally or an empty
	// string ("") would be mistaken for an escaped quote.
	dest=string;
	string++;

	// remove the quotes and end at the first unremoveable quote
	while (*string)
	{
		if (*string=='\"') {
			if (string[1]!='\"') break; //closing quote
			string++;//skip the first quote
		}
		*dest++=*string++;
	}
	*dest='\0';
}
392
/*!
 * Tell apart an empty string from a string with some content.
 *
 * A string is empty if it has no characters at all or if it only
 * contains a single dash.
 *
 * \param String The string to check. It may be NULL.
 *
 * \return The string pointer if it isn't empty or NULL if the string
 * is empty or was already NULL.
 */
static const char *ExtLog_FixEmptyString(const char *String)
{
	if (!String) return(NULL);
	if (String[0]=='-') return((String[1]=='\0') ? NULL : String);
	return((String[0]=='\0') ? NULL : String);
}
408
409 /*!
410 * Create the URL from the split elements.
411 */
412 static char *ExtLog_ConcatUrl(const char *Scheme,const char *Host,const char *Port,const char *Path,const char *Query)
413 {
414 int tlen=0;
415 int len;
416
417 Scheme=ExtLog_FixEmptyString(Scheme);
418 Host=ExtLog_FixEmptyString(Host);
419 if (!Scheme && !Host)
420 {
421 /*
422 * Example of such an entry:
423 *
424 * #Fields:
425 * date time time-taken c-ip sc-status s-action sc-bytes cs-bytes cs-method cs-uri-scheme cs-host cs-uri-port cs-uri-path cs-uri-query cs-username cs-auth-group s-hierarchy s-supplier-name rs(Content-Type) cs(Referer) cs(User-Agent) sc-filter-result cs-categories x-virus-id s-ip
426 * 2015-07-29 06:05:50 30 192.168.1.21 400 TCP_NC_MISS 903 8163 unknown - - 0 / - userid - - 10.81.0.26 - - - DENIED "unavailable" - 10.81.0.26 - - ICAP_NOT_SCANNED - - -
427 *
428 * It looks like a failed connection attempt to an unavailable resource. Let's assume it is safe to ignore it.
429 */
430 ExtTempUrl[0]='\0';
431 return(ExtTempUrl);
432 }
433 Port=ExtLog_FixEmptyString(Port);
434 Path=ExtLog_FixEmptyString(Path);
435 Query=ExtLog_FixEmptyString(Query);
436
437 if (Scheme)
438 {
439 len=strlen(Scheme);
440 if (tlen+len+3>=sizeof(ExtTempUrl))
441 {
442 debuga(__FILE__,__LINE__,_("URI scheme too long in log file\n"));
443 exit(EXIT_FAILURE);
444 }
445 strcpy(ExtTempUrl,Scheme);
446 strcpy(ExtTempUrl+len,"://");
447 tlen+=len+3;
448 }
449
450 if (Host)
451 {
452 len=strlen(Host);
453 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
454 strncpy(ExtTempUrl+tlen,Host,len);
455 tlen+=len;
456 }
457
458 if (tlen+2<sizeof(ExtTempUrl) && Port)
459 {
460 len=strlen(Port);
461 if (tlen+len+1>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-2;
462 ExtTempUrl[tlen++]=':';
463 strncpy(ExtTempUrl+tlen,Port,len);
464 tlen+=len;
465 }
466
467 if (tlen<sizeof(ExtTempUrl) && Path)
468 {
469 len=strlen(Path);
470 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
471 strncpy(ExtTempUrl+tlen,Path,len);
472 tlen+=len;
473 }
474
475 if (tlen<sizeof(ExtTempUrl) && Query)
476 {
477 len=strlen(Query);
478 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
479 strncpy(ExtTempUrl+tlen,Query,len);
480 tlen+=len;
481 }
482 ExtTempUrl[tlen]='\0';
483 return(ExtTempUrl);
484 }
485
486 /*!
487 Read one entry from an extended log.
488
489 \param Line One line from the input log file.
490 \param Entry Where to store the information parsed from the line.
491
492 \retval RLRC_NoError One valid entry is parsed.
493 \retval RLRC_Unknown The line is invalid.
494 \retval RLRC_InternalError An internal error was encountered.
495 */
496 static enum ReadLogReturnCodeEnum ExtLog_ReadEntry(char *Line,struct ReadLogStruct *Entry)
497 {
498 int col;
499 enum ext_col_id col_id;
500 char *Ip=NULL;
501 char *IpEnd;
502 char *User=NULL;
503 char *UserEnd;
504 char *UrlEnd;
505 char *HttpCodeEnd;
506 char *UrlScheme=NULL,*UrlSchemeEnd;
507 char *UrlHost=NULL,*UrlHostEnd;
508 char *UrlPort=NULL,*UrlPortEnd;
509 char *UrlPath=NULL,*UrlPathEnd;
510 char *UrlQuery=NULL,*UrlQueryEnd;
511 char *UserAgent=NULL,*UserAgentEnd;
512
513 // is it a directive
514 if (*Line=='#') {
515 enum ReadLogReturnCodeEnum status=ExtLog_Directive(Line);
516 if (status!=RLRC_Unknown) InExtLog=true;
517 return(status);
518 }
519 if (!InExtLog) return(RLRC_Unknown);
520
521 col=0;
522 while (*Line) {
523 if (col>=ExtColNumber) {
524 debuga(__FILE__,__LINE__,_("Too many columns in an extended log file format: %d columns found when %d have been announced\n"),col,ExtColNumber);
525 return(RLRC_Unknown);
526 }
527 col_id=ExtLog_WhichColumn(col);
528 switch (col_id)
529 {
530 case EXTCOL_Ip:
531 Entry->Ip=Ip=Line;
532 Line=ExtLog_GetString(Line,col,&IpEnd);
533 if (!Line) return(RLRC_Unknown);
534 break;
535 case EXTCOL_UserName:
536 Entry->User=User=Line;
537 Line=ExtLog_GetString(Line,col,&UserEnd);
538 if (!Line) return(RLRC_Unknown);
539 break;
540 case EXTCOL_Date:
541 Line=ExtLog_GetDate(Line,&Entry->EntryTime);
542 if (!Line) return(RLRC_Unknown);
543 break;
544 case EXTCOL_Time:
545 Line=ExtLog_GetTime(Line,&Entry->EntryTime);
546 if (!Line) return(RLRC_Unknown);
547 break;
548 case EXTCOL_TimeTaken:
549 Line=ExtLog_GetLongInt(Line,&Entry->ElapsedTime);
550 if (!Line) return(RLRC_Unknown);
551 break;
552 case EXTCOL_Bytes:
553 Line=ExtLog_GetLongLongInt(Line,&Entry->DataSize);
554 if (!Line) return(RLRC_Unknown);
555 break;
556 case EXTCOL_Uri:
557 Entry->Url=Line;
558 Line=ExtLog_GetString(Line,col,&UrlEnd);
559 if (!Line) return(RLRC_Unknown);
560 break;
561 case EXTCOL_Scheme:
562 UrlScheme=Line;
563 Line=ExtLog_GetString(Line,col,&UrlSchemeEnd);
564 if (!Line) return(RLRC_Unknown);
565 break;
566 case EXTCOL_Host:
567 UrlHost=Line;
568 Line=ExtLog_GetString(Line,col,&UrlHostEnd);
569 if (!Line) return(RLRC_Unknown);
570 break;
571 case EXTCOL_Port:
572 UrlPort=Line;
573 Line=ExtLog_GetString(Line,col,&UrlPortEnd);
574 if (!Line) return(RLRC_Unknown);
575 break;
576 case EXTCOL_Path:
577 UrlPath=Line;
578 Line=ExtLog_GetString(Line,col,&UrlPathEnd);
579 if (!Line) return(RLRC_Unknown);
580 break;
581 case EXTCOL_Query:
582 UrlQuery=Line;
583 Line=ExtLog_GetString(Line,col,&UrlQueryEnd);
584 if (!Line) return(RLRC_Unknown);
585 break;
586 case EXTCOL_Status:
587 Entry->HttpCode=Line;
588 Line=ExtLog_GetString(Line,col,&HttpCodeEnd);
589 if (!Line) return(RLRC_Unknown);
590 break;
591 case EXTCOL_UserAgent:
592 UserAgent=Line;
593 Line=ExtLog_GetString(Line,col,&UserAgentEnd);
594 if (!Line) return(RLRC_Unknown);
595 break;
596 case EXTCOL_Last://ignored column
597 Line=ExtLog_GetString(Line,col,NULL);
598 if (!Line) return(RLRC_Unknown);
599 break;
600 }
601 if (*Line && *Line!=ExtColSep[col]) return(RLRC_Unknown);
602 while (*Line && *Line==ExtColSep[col]) Line++;
603 col++;
604 }
605 if (col!=ExtColNumber) {
606 debuga(__FILE__,__LINE__,_("Only %d columns in an extended log file format when %d have been announced\n"),col,ExtColNumber);
607 return(RLRC_Unknown);
608 }
609
610 // check the entry time
611 if (mktime(&Entry->EntryTime)==-1) {
612 debuga(__FILE__,__LINE__,_("Invalid date or time found in the extended log file\n"));
613 return(RLRC_InternalError);
614 }
615
616 ExtLog_FixString(Ip,IpEnd);
617 ExtLog_FixString(User,UserEnd);
618 ExtLog_FixString(Entry->Url,UrlEnd);
619 ExtLog_FixString(Entry->HttpCode,HttpCodeEnd);
620 if (!Entry->Url)
621 {
622 ExtLog_FixString(UrlScheme,UrlSchemeEnd);
623 ExtLog_FixString(UrlHost,UrlHostEnd);
624 ExtLog_FixString(UrlPort,UrlPortEnd);
625 ExtLog_FixString(UrlPath,UrlPathEnd);
626 ExtLog_FixString(UrlQuery,UrlQueryEnd);
627 Entry->Url=ExtLog_ConcatUrl(UrlScheme,UrlHost,UrlPort,UrlPath,UrlQuery);
628 }
629 ExtLog_FixString(UserAgent,UserAgentEnd);
630 Entry->UserAgent=ExtLog_FixEmptyString(UserAgent);
631
632 return(RLRC_NoError);
633 }
634
//! \brief Object to read an extended log.
//!
//! It gathers the name of the format and the two entry points of this
//! parser: the function resetting the state for a new file and the
//! function decoding one line.
const struct ReadLogProcessStruct ReadExtLog=
{
	/* TRANSLATORS: This is the name of the log format displayed when this format is detected in an input log file. */
	N_("extended log format"),
	ExtLog_NewFile,
	ExtLog_ReadEntry
};