]> git.ipfire.org Git - thirdparty/sarg.git/blob - readlog_extlog.c
Update the Russian translation
[thirdparty/sarg.git] / readlog_extlog.c
1 /*
2 * SARG Squid Analysis Report Generator http://sarg.sourceforge.net
3 * 1998, 2015
4 *
5 * SARG donations:
6 * please look at http://sarg.sourceforge.net/donations.php
7 * Support:
8 * http://sourceforge.net/projects/sarg/forums/forum/363374
9 * ---------------------------------------------------------------------
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
24 *
25 */
26
27 #include "include/conf.h"
28 #include "include/defs.h"
29 #include "include/readlog.h"
30
/*!
Upper bound on the number of columns that a "fields" directive of an
extended log format may declare.

The value is arbitrary: it only exists so that the per-column tables
of this file have a fixed size.
*/
#define MAX_EXT_COLUMNS 250
38
//! Identifiers of the columns this reader extracts from an extended log.
enum ext_col_id {
	EXTCOL_Ip,        //!< "c-ip" or "c-dns" field.
	EXTCOL_UserName,  //!< "cs-username" field.
	EXTCOL_Date,      //!< "date" field.
	EXTCOL_Time,      //!< "time" field.
	EXTCOL_TimeTaken, //!< "time-taken" field.
	EXTCOL_Bytes,     //!< "sc-bytes" field.
	EXTCOL_Uri,       //!< "cs-uri" field.
	EXTCOL_Scheme,    //!< "cs-uri-scheme" field.
	EXTCOL_Host,      //!< "cs-host" field.
	EXTCOL_Port,      //!< "cs-uri-port" field.
	EXTCOL_Path,      //!< "cs-uri-path" field.
	EXTCOL_Query,     //!< "cs-uri-query" field.
	EXTCOL_Status,    //!< "sc-status" field.
	EXTCOL_Last       //!< Number of identifiers. Must stay the last entry of the list!
};
55
56 //! \c True if the extended common long format is confirmed.
57 static bool InExtLog=false;
58 //! The index of relevant columns in the log file.
59 static int ExtCols[EXTCOL_Last];
60 //! The character to use as a columns separator.
61 static char ExtColSep[MAX_EXT_COLUMNS];
62 //! The number of columns according to the "fields" directive.
63 static int ExtColNumber;
64 //! Temporary buffer to concatenate the url.
65 static char ExtTempUrl[MAX_URL_LEN];
66
67 /*!
68 A new file is being read. The name of the file is \a FileName.
69 */
70 static void ExtLog_NewFile(const char *FileName)
71 {
72 InExtLog=false;
73 ExtColNumber=0;
74 }
75
76 /*!
77 Parse the "Fields" directive listing the columns in the log. The
78 \a columns is a pointer to the first column of the directive.
79
80 \return \c True if the fields is valid or false if it could not
81 be decoded.
82 */
83 static bool ExtLog_Fields(const char *columns)
84 {
85 int col;
86 int len;
87 int prefix;
88 int header_start;
89 int header_end;
90 int i;
91 enum ext_col_id col_id;
92 char col_sep;
93 // see http://www.w3.org/TR/WD-logfile.html for the list of prefixes
94 const char const *prefixes[]=
95 {
96 "c",
97 "s",
98 "r",
99 "cs",
100 "sc",
101 "sr",
102 "rs",
103 "x",
104 };
105
106 for (i=0 ; i<EXTCOL_Last ; i++) ExtCols[i]=-1;
107
108 col=0;
109 while (*columns) {
110 if (col>=MAX_EXT_COLUMNS) {
111 debuga(__FILE__,__LINE__,_("Too many columns found in an extended log format. The maximum allowed is %d but it can be changed if a bigger value is legitimate\n"),MAX_EXT_COLUMNS);
112 exit(EXIT_FAILURE);
113 }
114 prefix=-1;
115 header_start=-1;
116 header_end=-1;
117 for (i=sizeof(prefixes)/sizeof(*prefixes)-1 ; i>=0 ; i--) {
118 len=strlen(prefixes[i]);
119 if (strncasecmp(columns,prefixes[i],len)==0) {
120 if (columns[len]=='-') {
121 prefix=len++;
122 break;
123 } else if (columns[len]=='(') {
124 header_start=len++;
125 break;
126 }
127 }
128 }
129 (void)prefix;//compiler pacifier
130 if (i<0) len=0;
131 for ( ; (unsigned char)columns[len]>' ' ; len++) {//skip a word and accept any separator (tab or space)
132 if (header_start>=0 && columns[len]==')') header_end=len;
133 }
134 (void)header_end;//compiler pacifier
135 col_sep=columns[len];
136 ExtColSep[col]=col_sep;
137
138 // see http://www.w3.org/TR/WD-logfile.html for list of possible identifiers
139 col_id=EXTCOL_Last;
140 if (len==4) {
141 if (strncasecmp(columns,"c-ip",len)==0 && ExtCols[EXTCOL_Ip]<0) col_id=EXTCOL_Ip;
142 else if (strncasecmp(columns,"date",len)==0) col_id=EXTCOL_Date;
143 else if (strncasecmp(columns,"time",len)==0) col_id=EXTCOL_Time;
144 } else if (len==5) {
145 if (strncasecmp(columns,"c-dns",len)==0) col_id=EXTCOL_Ip;
146 } else if (len==6) {
147 if (strncasecmp(columns,"cs-uri",len)==0) col_id=EXTCOL_Uri;
148 } else if (len==7) {
149 if (strncasecmp(columns,"cs-host",len)==0) col_id=EXTCOL_Host;
150 } else if (len==8) {
151 if (strncasecmp(columns,"sc-bytes",len)==0) col_id=EXTCOL_Bytes;
152 } else if (len==9) {
153 if (strncasecmp(columns,"sc-status",len)==0) col_id=EXTCOL_Status;
154 } else if (len==10) {
155 if (strncasecmp(columns,"time-taken",len)==0) col_id=EXTCOL_TimeTaken;
156 } else if (len==11) {
157 if (strncasecmp(columns,"cs-username",len)==0) col_id=EXTCOL_UserName;
158 if (strncasecmp(columns,"cs-uri-port",len)==0) col_id=EXTCOL_Port;
159 if (strncasecmp(columns,"cs-uri-path",len)==0) col_id=EXTCOL_Path;
160 } else if (len==12) {
161 if (strncasecmp(columns,"cs-uri-query",len)==0) col_id=EXTCOL_Query;
162 } else if (len==13) {
163 if (strncasecmp(columns,"cs-uri-scheme",len)==0) col_id=EXTCOL_Scheme;
164 }
165 if (col_id!=EXTCOL_Last) {
166 ExtCols[col_id]=col;
167 }
168
169 col++;
170 columns+=len;
171 while (*columns && (unsigned char)*columns<=' ') {
172 if (*columns!=col_sep) {
173 debuga(__FILE__,__LINE__,_("Multiple column separators found between two columns in the \"fields\" directive of an extended log format\n"));
174 exit(EXIT_FAILURE);
175 }
176 columns++;
177 }
178 }
179 ExtColNumber=col;
180 return(true);
181 }
182
183 /*!
184 Decode a directive field from the \a Line.
185
186 \return RLRC_Ignore if the line is a directive or RLRC_Unknown
187 if the line is not a known directive.
188 */
189 static enum ReadLogReturnCodeEnum ExtLog_Directive(const char *Line)
190 {
191 ++Line;
192 if (strncasecmp(Line,"Version:",8)==0) return(RLRC_Ignore);
193 if (strncasecmp(Line,"Software:",9)==0) return(RLRC_Ignore);
194 if (strncasecmp(Line,"Start-Date:",11)==0) return(RLRC_Ignore);
195 if (strncasecmp(Line,"End-Date:",9)==0) return(RLRC_Ignore);
196 if (strncasecmp(Line,"Date:",5)==0) return(RLRC_Ignore);
197 if (strncasecmp(Line,"Remark:",7)==0) return(RLRC_Ignore);
198 if (strncasecmp(Line,"Fields:",7)==0) {
199 Line+=7;
200 while (*Line==' ' || *Line=='\t') Line++;
201 if (!ExtLog_Fields(Line)) return(RLRC_Unknown);
202 return(RLRC_Ignore);
203 }
204 return(RLRC_Unknown);
205 }
206
207 /*!
208 Get the type of the column \a col_num.
209
210 \return The type of the column or EXTCOL_Last if
211 the column must be ignored.
212 */
213 static enum ext_col_id ExtLog_WhichColumn(int col_num)
214 {
215 int i;
216
217 for (i=0 ; i<EXTCOL_Last && ExtCols[i]!=col_num ; i++);
218 return(i);
219 }
220
221 /*!
222 Scan through the string of a column.
223
224 \param Line The pointer to the beginning of the string.
225 \param col The column number.
226 */
227 static char *ExtLog_GetString(char *Line,int col,char **End)
228 {
229 bool quote;
230 bool dequote;
231
232 //skip opening double quote
233 quote=(*Line=='\"');
234 if (quote) ++Line;
235
236 dequote=false;
237 while (*Line) {
238 if (quote) {
239 if (*Line=='\"') {
240 if (Line[1]!='\"') {
241 if (End) *End=(dequote) ? NULL : Line;
242 Line++;//skip the closing quote
243 quote=false;
244 break;
245 }
246 dequote=true;
247 }
248 } else {
249 if (*Line==ExtColSep[col]) {
250 if (End) *End=Line;
251 break;
252 }
253 }
254 Line++;
255 }
256 if (quote) return(NULL);//missing closing quote.
257 return(Line);
258 }
259
/*!
Scan through the date in a column. The date is expected as
year-month-day and may be enclosed in double quotes.

\param Line The pointer to the beginning of the string.
\param Date The structure whose year, month and day of the month are
set from the date. It is left untouched if the date is invalid.

\return The pointer to the character following the date (and its
closing quote if any) or NULL if the date can't be parsed.
*/
static char *ExtLog_GetDate(char *Line,struct tm *Date)
{
	int y;
	int m;
	int d;
	int consumed;
	const bool quoted=(*Line=='\"');

	//skip opening double quote
	if (quoted) Line++;

	if (sscanf(Line,"%d-%d-%d%n",&y,&m,&d,&consumed)!=3) return(NULL);
	Line+=consumed;

	if (quoted) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		Line++;
	}

	// struct tm counts the years from 1900 and the months from zero
	Date->tm_mday=d;
	Date->tm_mon=m-1;
	Date->tm_year=y-1900;
	return(Line);
}
287
/*!
Scan through the time in a column. The time is expected as
hour:minute:second and may be enclosed in double quotes.

\param Line The pointer to the beginning of the string.
\param Date The structure whose hour, minute and second are set
from the time. It is left untouched if the time is invalid.

\return The pointer to the character following the time (and its
closing quote if any) or NULL if the time can't be parsed.
*/
static char *ExtLog_GetTime(char *Line,struct tm *Date)
{
	int h;
	int m;
	int s;
	int consumed;
	const bool quoted=(*Line=='\"');

	//skip opening double quote
	if (quoted) Line++;

	if (sscanf(Line,"%d:%d:%d%n",&h,&m,&s,&consumed)!=3) return(NULL);
	Line+=consumed;

	if (quoted) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		Line++;
	}

	Date->tm_sec=s;
	Date->tm_min=m;
	Date->tm_hour=h;
	return(Line);
}
315
/*!
Scan through a number in a column. The number is an unsigned sequence
of decimal digits that may be enclosed in double quotes. No overflow
check is made on the number.

\param Line The pointer to the beginning of the string.
\param Value A variable to store the number. It is set to zero if
the string doesn't start with a digit.

\return The pointer to the first character following the number (and
its closing quote if any) or NULL if the closing quote is missing.
*/
static char *ExtLog_GetLongInt(char *Line,long int *Value)
{
	bool quote;

	//skip opening double quote
	quote=(*Line=='\"');
	if (quote) ++Line;
	*Value=0;
	// isdigit() is only defined for values representable as unsigned char
	// (or EOF): a plain char may be negative for bytes above 127.
	while (isdigit((unsigned char)*Line)) *Value=*Value*10+(*Line++-'0');
	if (quote) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		++Line;
	}
	return(Line);
}
337
/*!
Scan through a number in a column. The number is an unsigned sequence
of decimal digits that may be enclosed in double quotes. No overflow
check is made on the number.

\param Line The pointer to the beginning of the string.
\param Value A variable to store the number. It is set to zero if
the string doesn't start with a digit.

\return The pointer to the first character following the number (and
its closing quote if any) or NULL if the closing quote is missing.
*/
static char *ExtLog_GetLongLongInt(char *Line,long long int *Value)
{
	bool quote;

	//skip opening double quote
	quote=(*Line=='\"');
	if (quote) ++Line;
	*Value=0;
	// isdigit() is only defined for values representable as unsigned char
	// (or EOF): a plain char may be negative for bytes above 127.
	while (isdigit((unsigned char)*Line)) *Value=*Value*10+(*Line++-'0');
	if (quote) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		++Line;
	}
	return(Line);
}
359
/*!
Terminate the \a string of a column in place and remove its quotes.

The \a string points to the first character of the column. If that
character is a double quote, it is the opening quote of the string and
it is removed by shifting the content to the left.

\param string The beginning of the column or NULL if the column was
not parsed, in which case nothing is done.
\param end_ptr If the string is known to contain no doubled quotes, the
pointer to the character just after the content of the string (the
closing quote or the column separator). If it is NULL, the string is
scanned: every doubled quote is replaced by a single quote and the
string ends at the first quote that is not doubled.
*/
static void ExtLog_FixString(char *string,char *end_ptr)
{
	char *src;
	char *dest;

	if (!string) return;//string not parsed

	dest=string;
	src=string;
	// The string still begins with its opening quote as the scanner doesn't
	// move the start of the column. Don't copy it into the final string.
	if (*src=='\"' && (!end_ptr || end_ptr>src)) src++;

	if (end_ptr) { //end is known and no quotes are in the string
		if (src==string) {
			*end_ptr='\0';
			return;
		}
		while (src<end_ptr) *dest++=*src++;
		*dest='\0';
		return;
	}

	// remove the quotes and end at the first unremoveable quote
	while (*src)
	{
		if (*src=='\"') {
			if (src[1]!='\"') break; //closing quote
			src++;//skip the first quote
		}
		*dest++=*src++;
	}
	*dest='\0';
}
386
/*!
 * Discard an empty string.
 *
 * A string is empty if it has no characters or if it is made of
 * a single dash.
 *
 * \param String The string to check. It may be NULL.
 *
 * \return The string pointer if it isn't empty or NULL if the string
 * is empty or was already NULL.
 */
static const char *ExtLog_FixEmptyString(const char *String)
{
	if (String==NULL) return(NULL);
	if (String[0]=='\0') return(NULL);
	// a lone dash is the placeholder for a missing value
	if (String[0]=='-' && String[1]=='\0') return(NULL);
	return(String);
}
402
403 /*!
404 * Create the URL from the split elements.
405 */
406 static char *ExtLog_ConcatUrl(const char *Scheme,const char *Host,const char *Port,const char *Path,const char *Query)
407 {
408 int tlen=0;
409 int len;
410
411 Scheme=ExtLog_FixEmptyString(Scheme);
412 Host=ExtLog_FixEmptyString(Host);
413 if (!Scheme && !Host)
414 {
415 /*
416 * Example of such an entry:
417 *
418 * #Fields:
419 * date time time-taken c-ip sc-status s-action sc-bytes cs-bytes cs-method cs-uri-scheme cs-host cs-uri-port cs-uri-path cs-uri-query cs-username cs-auth-group s-hierarchy s-supplier-name rs(Content-Type) cs(Referer) cs(User-Agent) sc-filter-result cs-categories x-virus-id s-ip
420 * 2015-07-29 06:05:50 30 192.168.1.21 400 TCP_NC_MISS 903 8163 unknown - - 0 / - userid - - 10.81.0.26 - - - DENIED "unavailable" - 10.81.0.26 - - ICAP_NOT_SCANNED - - -
421 *
422 * It looks like a failed connection attempt to an unavailable resource. Let's assume it is safe to ignore it.
423 */
424 ExtTempUrl[0]='\0';
425 return(ExtTempUrl);
426 }
427 Port=ExtLog_FixEmptyString(Port);
428 Path=ExtLog_FixEmptyString(Path);
429 Query=ExtLog_FixEmptyString(Query);
430
431 if (Scheme)
432 {
433 len=strlen(Scheme);
434 if (tlen+len+3>=sizeof(ExtTempUrl))
435 {
436 debuga(__FILE__,__LINE__,_("URI scheme too long in log file\n"));
437 exit(EXIT_FAILURE);
438 }
439 strcpy(ExtTempUrl,Scheme);
440 strcpy(ExtTempUrl+len,"://");
441 tlen+=len+3;
442 }
443
444 if (Host)
445 {
446 len=strlen(Host);
447 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
448 strncpy(ExtTempUrl+tlen,Host,len);
449 tlen+=len;
450 }
451
452 if (tlen+2<sizeof(ExtTempUrl) && Port)
453 {
454 len=strlen(Port);
455 if (tlen+len+1>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-2;
456 ExtTempUrl[tlen++]=':';
457 strncpy(ExtTempUrl+tlen,Port,len);
458 tlen+=len;
459 }
460
461 if (tlen<sizeof(ExtTempUrl) && Path)
462 {
463 len=strlen(Path);
464 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
465 strncpy(ExtTempUrl+tlen,Path,len);
466 tlen+=len;
467 }
468
469 if (tlen<sizeof(ExtTempUrl) && Query)
470 {
471 len=strlen(Query);
472 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
473 strncpy(ExtTempUrl+tlen,Query,len);
474 tlen+=len;
475 }
476 ExtTempUrl[tlen]='\0';
477 return(ExtTempUrl);
478 }
479
480 /*!
481 Read one entry from an extended log.
482
483 \param Line One line from the input log file.
484 \param Entry Where to store the information parsed from the line.
485
486 \retval RLRC_NoError One valid entry is parsed.
487 \retval RLRC_Unknown The line is invalid.
488 \retval RLRC_InternalError An internal error was encountered.
489 */
490 static enum ReadLogReturnCodeEnum ExtLog_ReadEntry(char *Line,struct ReadLogStruct *Entry)
491 {
492 int col;
493 enum ext_col_id col_id;
494 char *Ip=NULL;
495 char *IpEnd;
496 char *User=NULL;
497 char *UserEnd;
498 char *UrlEnd;
499 char *HttpCodeEnd;
500 char *UrlScheme=NULL,*UrlSchemeEnd;
501 char *UrlHost=NULL,*UrlHostEnd;
502 char *UrlPort=NULL,*UrlPortEnd;
503 char *UrlPath=NULL,*UrlPathEnd;
504 char *UrlQuery=NULL,*UrlQueryEnd;
505
506 // is it a directive
507 if (*Line=='#') {
508 enum ReadLogReturnCodeEnum status=ExtLog_Directive(Line);
509 if (status!=RLRC_Unknown) InExtLog=true;
510 return(status);
511 }
512 if (!InExtLog) return(RLRC_Unknown);
513
514 col=0;
515 while (*Line) {
516 if (col>=ExtColNumber) {
517 debuga(__FILE__,__LINE__,_("Too many columns in an extended log file format: %d columns found when %d have been announced\n"),col,ExtColNumber);
518 return(RLRC_Unknown);
519 }
520 col_id=ExtLog_WhichColumn(col);
521 switch (col_id)
522 {
523 case EXTCOL_Ip:
524 Entry->Ip=Ip=Line;
525 Line=ExtLog_GetString(Line,col,&IpEnd);
526 if (!Line) return(RLRC_Unknown);
527 break;
528 case EXTCOL_UserName:
529 Entry->User=User=Line;
530 Line=ExtLog_GetString(Line,col,&UserEnd);
531 if (!Line) return(RLRC_Unknown);
532 break;
533 case EXTCOL_Date:
534 Line=ExtLog_GetDate(Line,&Entry->EntryTime);
535 if (!Line) return(RLRC_Unknown);
536 break;
537 case EXTCOL_Time:
538 Line=ExtLog_GetTime(Line,&Entry->EntryTime);
539 if (!Line) return(RLRC_Unknown);
540 break;
541 case EXTCOL_TimeTaken:
542 Line=ExtLog_GetLongInt(Line,&Entry->ElapsedTime);
543 if (!Line) return(RLRC_Unknown);
544 break;
545 case EXTCOL_Bytes:
546 Line=ExtLog_GetLongLongInt(Line,&Entry->DataSize);
547 if (!Line) return(RLRC_Unknown);
548 break;
549 case EXTCOL_Uri:
550 Entry->Url=Line;
551 Line=ExtLog_GetString(Line,col,&UrlEnd);
552 if (!Line) return(RLRC_Unknown);
553 break;
554 case EXTCOL_Scheme:
555 UrlScheme=Line;
556 Line=ExtLog_GetString(Line,col,&UrlSchemeEnd);
557 if (!Line) return(RLRC_Unknown);
558 break;
559 case EXTCOL_Host:
560 UrlHost=Line;
561 Line=ExtLog_GetString(Line,col,&UrlHostEnd);
562 if (!Line) return(RLRC_Unknown);
563 break;
564 case EXTCOL_Port:
565 UrlPort=Line;
566 Line=ExtLog_GetString(Line,col,&UrlPortEnd);
567 if (!Line) return(RLRC_Unknown);
568 break;
569 case EXTCOL_Path:
570 UrlPath=Line;
571 Line=ExtLog_GetString(Line,col,&UrlPathEnd);
572 if (!Line) return(RLRC_Unknown);
573 break;
574 case EXTCOL_Query:
575 UrlQuery=Line;
576 Line=ExtLog_GetString(Line,col,&UrlQueryEnd);
577 if (!Line) return(RLRC_Unknown);
578 break;
579 case EXTCOL_Status:
580 Entry->HttpCode=Line;
581 Line=ExtLog_GetString(Line,col,&HttpCodeEnd);
582 if (!Line) return(RLRC_Unknown);
583 break;
584 case EXTCOL_Last://ignored column
585 Line=ExtLog_GetString(Line,col,NULL);
586 if (!Line) return(RLRC_Unknown);
587 break;
588 }
589 if (*Line && *Line!=ExtColSep[col]) return(RLRC_Unknown);
590 while (*Line && *Line==ExtColSep[col]) Line++;
591 col++;
592 }
593 if (col!=ExtColNumber) {
594 debuga(__FILE__,__LINE__,_("Only %d columns in an extended log file format when %d have been announced\n"),col,ExtColNumber);
595 return(RLRC_Unknown);
596 }
597
598 // check the entry time
599 if (mktime(&Entry->EntryTime)==-1) {
600 debuga(__FILE__,__LINE__,_("Invalid date or time found in the extended log file\n"));
601 return(RLRC_InternalError);
602 }
603
604 ExtLog_FixString(Ip,IpEnd);
605 ExtLog_FixString(User,UserEnd);
606 ExtLog_FixString(Entry->Url,UrlEnd);
607 ExtLog_FixString(Entry->HttpCode,HttpCodeEnd);
608 if (!Entry->Url)
609 {
610 ExtLog_FixString(UrlScheme,UrlSchemeEnd);
611 ExtLog_FixString(UrlHost,UrlHostEnd);
612 ExtLog_FixString(UrlPort,UrlPortEnd);
613 ExtLog_FixString(UrlPath,UrlPathEnd);
614 ExtLog_FixString(UrlQuery,UrlQueryEnd);
615 Entry->Url=ExtLog_ConcatUrl(UrlScheme,UrlHost,UrlPort,UrlPath,UrlQuery);
616 }
617
618 return(RLRC_NoError);
619 }
620
621 //! \brief Object to read an extended log.
622 const struct ReadLogProcessStruct ReadExtLog=
623 {
624 /* TRANSLATORS: This is the name of the log format displayed when this format is detected in an input log file. */
625 N_("extended log format"),
626 ExtLog_NewFile,
627 ExtLog_ReadEntry
628 };