]> git.ipfire.org Git - thirdparty/sarg.git/blob - readlog_extlog.c
Read an extended log even if cs-uri is split over several columns
[thirdparty/sarg.git] / readlog_extlog.c
1 /*
2 * SARG Squid Analysis Report Generator http://sarg.sourceforge.net
3 * 1998, 2015
4 *
5 * SARG donations:
6 * please look at http://sarg.sourceforge.net/donations.php
7 * Support:
8 * http://sourceforge.net/projects/sarg/forums/forum/363374
9 * ---------------------------------------------------------------------
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
24 *
25 */
26
#include "include/conf.h"
#include "include/defs.h"
#include "include/readlog.h"

#include <limits.h>
30
/*!
Maximum number of columns accepted in an extended log format.

The current value is an arbitrary number chosen to have an
actual limit. It sizes the table of column separators and the
program exits if the "fields" directive lists more columns.
*/
#define MAX_EXT_COLUMNS 250
38
//! The columns of an extended log this module knows how to use.
enum ext_col_id {
	EXTCOL_Ip,        //!< Client address: "c-ip" or "c-dns".
	EXTCOL_UserName,  //!< User name: "cs-username".
	EXTCOL_Date,      //!< Date of the entry: "date".
	EXTCOL_Time,      //!< Time of the entry: "time".
	EXTCOL_TimeTaken, //!< Duration of the request: "time-taken".
	EXTCOL_Bytes,     //!< Size of the reply: "sc-bytes".
	EXTCOL_Uri,       //!< Complete URI: "cs-uri".
	EXTCOL_Scheme,    //!< Scheme of a split URI: "cs-uri-scheme".
	EXTCOL_Host,      //!< Host of a split URI: "cs-host".
	EXTCOL_Port,      //!< Port of a split URI: "cs-uri-port".
	EXTCOL_Path,      //!< Path of a split URI: "cs-uri-path".
	EXTCOL_Query,     //!< Query of a split URI: "cs-uri-query".
	EXTCOL_Status,    //!< Status code: "sc-status".
	EXTCOL_Last //last entry of the list ! Also used to mean "column to ignore".
};
55
//! \c True once a directive of the extended common log format has been recognized in the current file.
static bool InExtLog=false;
//! The index of relevant columns in the log file or -1 if the column is not listed.
static int ExtCols[EXTCOL_Last];
//! For each column, the character found after it in the "fields" directive and used as its separator.
static char ExtColSep[MAX_EXT_COLUMNS];
//! The number of columns according to the "fields" directive.
static int ExtColNumber;
//! Temporary buffer to concatenate the url.
static char ExtTempUrl[MAX_URL_LEN];
66
67 /*!
68 A new file is being read. The name of the file is \a FileName.
69 */
70 static void ExtLog_NewFile(const char *FileName)
71 {
72 InExtLog=false;
73 ExtColNumber=0;
74 }
75
76 /*!
77 Parse the "Fields" directive listing the columns in the log. The
78 \a columns is a pointer to the first column of the directive.
79
80 \return \c True if the fields is valid or false if it could not
81 be decoded.
82 */
83 static bool ExtLog_Fields(const char *columns)
84 {
85 int col;
86 int len;
87 int prefix;
88 int header_start;
89 int header_end;
90 int i;
91 enum ext_col_id col_id;
92 char col_sep;
93 // see http://www.w3.org/TR/WD-logfile.html for the list of prefixes
94 const char const *prefixes[]=
95 {
96 "c",
97 "s",
98 "r",
99 "cs",
100 "sc",
101 "sr",
102 "rs",
103 "x",
104 };
105
106 for (i=0 ; i<EXTCOL_Last ; i++) ExtCols[i]=-1;
107
108 col=0;
109 while (*columns) {
110 if (col>=MAX_EXT_COLUMNS) {
111 debuga(__FILE__,__LINE__,_("Too many columns found in an extended log format. The maximum allowed is %d but it can be changed if a bigger value is legitimate\n"),MAX_EXT_COLUMNS);
112 exit(EXIT_FAILURE);
113 }
114 prefix=-1;
115 header_start=-1;
116 header_end=-1;
117 for (i=sizeof(prefixes)/sizeof(*prefixes)-1 ; i>=0 ; i--) {
118 len=strlen(prefixes[i]);
119 if (strncasecmp(columns,prefixes[i],len)==0) {
120 if (columns[len]=='-') {
121 prefix=len++;
122 break;
123 } else if (columns[len]=='(') {
124 header_start=len++;
125 break;
126 }
127 }
128 }
129 (void)prefix;//compiler pacifier
130 if (i<0) len=0;
131 for ( ; (unsigned char)columns[len]>' ' ; len++) {//skip a word and accept any separator (tab or space)
132 if (header_start>=0 && columns[len]==')') header_end=len;
133 }
134 (void)header_end;//compiler pacifier
135 col_sep=columns[len];
136 ExtColSep[col]=col_sep;
137
138 // see http://www.w3.org/TR/WD-logfile.html for list of possible identifiers
139 col_id=EXTCOL_Last;
140 if (len==4) {
141 if (strncasecmp(columns,"c-ip",len)==0 && ExtCols[EXTCOL_Ip]<0) col_id=EXTCOL_Ip;
142 else if (strncasecmp(columns,"date",len)==0) col_id=EXTCOL_Date;
143 else if (strncasecmp(columns,"time",len)==0) col_id=EXTCOL_Time;
144 } else if (len==5) {
145 if (strncasecmp(columns,"c-dns",len)==0) col_id=EXTCOL_Ip;
146 } else if (len==6) {
147 if (strncasecmp(columns,"cs-uri",len)==0) col_id=EXTCOL_Uri;
148 } else if (len==7) {
149 if (strncasecmp(columns,"cs-host",len)==0) col_id=EXTCOL_Host;
150 } else if (len==8) {
151 if (strncasecmp(columns,"sc-bytes",len)==0) col_id=EXTCOL_Bytes;
152 } else if (len==9) {
153 if (strncasecmp(columns,"sc-status",len)==0) col_id=EXTCOL_Status;
154 } else if (len==10) {
155 if (strncasecmp(columns,"time-taken",len)==0) col_id=EXTCOL_TimeTaken;
156 } else if (len==11) {
157 if (strncasecmp(columns,"cs-username",len)==0) col_id=EXTCOL_UserName;
158 if (strncasecmp(columns,"cs-uri-port",len)==0) col_id=EXTCOL_Port;
159 if (strncasecmp(columns,"cs-uri-path",len)==0) col_id=EXTCOL_Path;
160 } else if (len==12) {
161 if (strncasecmp(columns,"cs-uri-query",len)==0) col_id=EXTCOL_Query;
162 } else if (len==13) {
163 if (strncasecmp(columns,"cs-uri-scheme",len)==0) col_id=EXTCOL_Scheme;
164 }
165 if (col_id!=EXTCOL_Last) {
166 ExtCols[col_id]=col;
167 }
168
169 col++;
170 columns+=len;
171 while (*columns && (unsigned char)*columns<=' ') {
172 if (*columns!=col_sep) {
173 debuga(__FILE__,__LINE__,_("Multiple column separators found between two columns in the \"fields\" directive of an extended log format\n"));
174 exit(EXIT_FAILURE);
175 }
176 columns++;
177 }
178 }
179 ExtColNumber=col;
180 return(true);
181 }
182
183 /*!
184 Decode a directive field from the \a Line.
185
186 \return RLRC_Ignore if the line is a directive or RLRC_Unknown
187 if the line is not a known directive.
188 */
189 static enum ReadLogReturnCodeEnum ExtLog_Directive(const char *Line)
190 {
191 ++Line;
192 if (strncasecmp(Line,"Version:",8)==0) return(RLRC_Ignore);
193 if (strncasecmp(Line,"Software:",9)==0) return(RLRC_Ignore);
194 if (strncasecmp(Line,"Start-Date:",11)==0) return(RLRC_Ignore);
195 if (strncasecmp(Line,"End-Date:",9)==0) return(RLRC_Ignore);
196 if (strncasecmp(Line,"Date:",5)==0) return(RLRC_Ignore);
197 if (strncasecmp(Line,"Remark:",7)==0) return(RLRC_Ignore);
198 if (strncasecmp(Line,"Fields:",7)==0) {
199 Line+=7;
200 while (*Line==' ' || *Line=='\t') Line++;
201 if (!ExtLog_Fields(Line)) return(RLRC_Unknown);
202 return(RLRC_Ignore);
203 }
204 return(RLRC_Unknown);
205 }
206
207 /*!
208 Get the type of the column \a col_num.
209
210 \return The type of the column or EXTCOL_Last if
211 the column must be ignored.
212 */
213 static enum ext_col_id ExtLog_WhichColumn(int col_num)
214 {
215 int i;
216
217 for (i=0 ; i<EXTCOL_Last && ExtCols[i]!=col_num ; i++);
218 return(i);
219 }
220
221 /*!
222 Scan through the string of a column.
223
224 \param Line The pointer to the beginning of the string.
225 \param col The column number.
226 */
227 static char *ExtLog_GetString(char *Line,int col,char **End)
228 {
229 bool quote;
230 bool dequote;
231
232 //skip opening double quote
233 quote=(*Line=='\"');
234 if (quote) ++Line;
235
236 dequote=false;
237 while (*Line) {
238 if (quote) {
239 if (*Line=='\"') {
240 if (Line[1]!='\"') {
241 if (End) *End=(dequote) ? NULL : Line;
242 Line++;//skip the closing quote
243 quote=false;
244 break;
245 }
246 dequote=true;
247 }
248 } else {
249 if (*Line==ExtColSep[col]) {
250 if (End) *End=Line;
251 break;
252 }
253 }
254 Line++;
255 }
256 if (quote) return(NULL);//missing closing quote.
257 return(Line);
258 }
259
/*!
Scan through the date in a column. The date is formatted as
year-month-day and may be enclosed in double quotes.

\param Line The pointer to the beginning of the string.
\param Date Where to store the year, month and day. It is left
untouched if the date is not valid.

\return The pointer to the character following the date (and its
closing quote) or NULL if the date can't be decoded or the closing
quote is missing.
*/
static char *ExtLog_GetDate(char *Line,struct tm *Date)
{
	int y,m,d;
	int consumed;
	const bool quoted=(Line[0]=='\"');
	char *cursor=Line;

	if (quoted) cursor++;//skip opening double quote
	if (sscanf(cursor,"%d-%d-%d%n",&y,&m,&d,&consumed)!=3)
		return(NULL);
	cursor+=consumed;

	if (quoted) {
		if (cursor[0]!='\"') return(NULL);//missing closing quote.
		cursor++;
	}

	//struct tm counts the years from 1900 and the months from zero
	Date->tm_mday=d;
	Date->tm_mon=m-1;
	Date->tm_year=y-1900;
	return(cursor);
}
287
/*!
Scan through the time in a column. The time is formatted as
hour:minute:second and may be enclosed in double quotes.

\param Line The pointer to the beginning of the string.
\param Date Where to store the hour, minute and second. It is left
untouched if the time is not valid.

\return The pointer to the character following the time (and its
closing quote) or NULL if the time can't be decoded or the closing
quote is missing.
*/
static char *ExtLog_GetTime(char *Line,struct tm *Date)
{
	int h,m,s;
	int consumed;
	const bool quoted=(Line[0]=='\"');
	char *cursor=Line;

	if (quoted) cursor++;//skip opening double quote
	if (sscanf(cursor,"%d:%d:%d%n",&h,&m,&s,&consumed)!=3)
		return(NULL);
	cursor+=consumed;

	if (quoted) {
		if (cursor[0]!='\"') return(NULL);//missing closing quote.
		cursor++;
	}

	Date->tm_sec=s;
	Date->tm_min=m;
	Date->tm_hour=h;
	return(cursor);
}
315
/*!
Scan through a number in a column. The number is a sequence of
decimal digits that may be enclosed in double quotes. No sign is
accepted. A column without any digit produces zero.

A number too big to fit in a long int is clamped to LONG_MAX instead
of overflowing.

\param Line The pointer to the beginning of the string.
\param Value A variable to store the number.

\return The pointer to the character following the number (and its
closing quote) or NULL if the closing quote is missing.
*/
static char *ExtLog_GetLongInt(char *Line,long int *Value)
{
	bool quote;
	long int number=0;

	//skip opening double quote
	quote=(*Line=='\"');
	if (quote) ++Line;

	//the cast is required as isdigit is undefined for negative values
	while (isdigit((unsigned char)*Line)) {
		int digit=*Line++-'0';
		if (number>(LONG_MAX-digit)/10)
			number=LONG_MAX;//the next step would overflow
		else
			number=number*10+digit;
	}
	*Value=number;

	if (quote) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		++Line;
	}
	return(Line);
}
337
/*!
Scan through a number in a column. The number is a sequence of
decimal digits that may be enclosed in double quotes. No sign is
accepted. A column without any digit produces zero.

A number too big to fit in a long long int is clamped to LLONG_MAX
instead of overflowing.

\param Line The pointer to the beginning of the string.
\param Value A variable to store the number.

\return The pointer to the character following the number (and its
closing quote) or NULL if the closing quote is missing.
*/
static char *ExtLog_GetLongLongInt(char *Line,long long int *Value)
{
	bool quote;
	long long int number=0;

	//skip opening double quote
	quote=(*Line=='\"');
	if (quote) ++Line;

	//the cast is required as isdigit is undefined for negative values
	while (isdigit((unsigned char)*Line)) {
		int digit=*Line++-'0';
		if (number>(LLONG_MAX-digit)/10)
			number=LLONG_MAX;//the next step would overflow
		else
			number=number*10+digit;
	}
	*Value=number;

	if (quote) {
		if (*Line!='\"') return(NULL);//missing closing quote.
		++Line;
	}
	return(Line);
}
359
/*!
Terminate the \a string in place and remove its quotes.

If the string begins with a double quote, that opening quote is
removed by shifting the content to the beginning of the buffer so
the pointer held by the caller remains valid.

\param string The beginning of the column including the opening
quote if the string is quoted. Nothing is done if it is NULL (the
column was not parsed).
\param end_ptr The pointer to the character following the last
character of the string if the string is known not to contain any
doubled quotes. If it is NULL, the string is copied over itself
with each pair of double quotes replaced by one double quote. The
copy stops at the first double quote not followed by another one or
at the end of the buffer.
*/
static void ExtLog_FixString(char *string,char *end_ptr)
{
	char *src;
	char *dest;

	if (!string) return;//string not parsed

	src=string;
	if (*src=='\"') src++;//skip the opening quote

	if (end_ptr) { //end is known and no quotes are in the string
		size_t len=(end_ptr>src) ? (size_t)(end_ptr-src) : 0;

		if (src!=string) memmove(string,src,len);
		string[len]='\0';
		return;
	}

	// remove the quotes and end at the first unremoveable quote
	dest=string;
	while (*src)
	{
		if (*src=='\"') {
			if (src[1]!='\"') break; //closing quote
			src++;//skip the first quote
		}
		*dest++=*src++;
	}
	*dest='\0';
}
386
/*!
 * Replace an empty string by a NULL pointer.
 *
 * A string is empty if it contains no characters or if it contains
 * nothing but a single dash.
 *
 * \param String The string to check. It may be NULL.
 *
 * \return The string pointer if it isn't empty or NULL if the string
 * is empty or was already NULL.
 */
static const char *ExtLog_FixEmptyString(const char *String)
{
	if (String==NULL) return(NULL);
	if (String[0]=='\0') return(NULL);
	if (String[0]=='-' && String[1]=='\0') return(NULL);
	return(String);
}
402
403 /*!
404 * Create the URL from the split elements.
405 */
406 static char *ExtLog_ConcatUrl(const char *Scheme,const char *Host,const char *Port,const char *Path,const char *Query)
407 {
408 int tlen=0;
409 int len;
410
411 Host=ExtLog_FixEmptyString(Host);
412 if (!Host) return(NULL);
413 Scheme=ExtLog_FixEmptyString(Scheme);
414 Port=ExtLog_FixEmptyString(Port);
415 Path=ExtLog_FixEmptyString(Path);
416 Query=ExtLog_FixEmptyString(Query);
417
418 if (Scheme)
419 {
420 len=strlen(Scheme);
421 if (tlen+len+3>=sizeof(ExtTempUrl))
422 {
423 debuga(__FILE__,__LINE__,_("URI scheme too long in log file\n"));
424 exit(EXIT_FAILURE);
425 }
426 strcpy(ExtTempUrl,Scheme);
427 strcpy(ExtTempUrl+len,"://");
428 tlen+=len+3;
429 }
430
431 len=strlen(Host);
432 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
433 strncpy(ExtTempUrl+tlen,Host,len);
434 tlen+=len;
435 ExtTempUrl[tlen]='\0';
436
437 if (tlen+2<sizeof(ExtTempUrl) && Port)
438 {
439 len=strlen(Port);
440 if (tlen+len+1>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-2;
441 ExtTempUrl[tlen++]=':';
442 strncpy(ExtTempUrl+tlen,Port,len);
443 tlen+=len;
444 ExtTempUrl[tlen]='\0';
445 }
446
447 if (tlen<sizeof(ExtTempUrl) && Path)
448 {
449 len=strlen(Path);
450 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
451 strncpy(ExtTempUrl+tlen,Path,len);
452 tlen+=len;
453 ExtTempUrl[tlen]='\0';
454 }
455
456 if (tlen<sizeof(ExtTempUrl) && Query)
457 {
458 len=strlen(Query);
459 if (tlen+len>=sizeof(ExtTempUrl)) len=sizeof(ExtTempUrl)-tlen-1;
460 strncpy(ExtTempUrl+tlen,Query,len);
461 tlen+=len;
462 ExtTempUrl[tlen]='\0';
463 }
464 return(ExtTempUrl);
465 }
466
467 /*!
468 Read one entry from an extended log.
469
470 \param Line One line from the input log file.
471 \param Entry Where to store the information parsed from the line.
472
473 \retval RLRC_NoError One valid entry is parsed.
474 \retval RLRC_Unknown The line is invalid.
475 \retval RLRC_InternalError An internal error was encountered.
476 */
477 static enum ReadLogReturnCodeEnum ExtLog_ReadEntry(char *Line,struct ReadLogStruct *Entry)
478 {
479 int col;
480 enum ext_col_id col_id;
481 char *Ip=NULL;
482 char *IpEnd;
483 char *UserEnd;
484 char *UrlEnd;
485 char *HttpCodeEnd;
486 char *UrlScheme=NULL,*UrlSchemeEnd;
487 char *UrlHost=NULL,*UrlHostEnd;
488 char *UrlPort=NULL,*UrlPortEnd;
489 char *UrlPath=NULL,*UrlPathEnd;
490 char *UrlQuery=NULL,*UrlQueryEnd;
491
492 // is it a directive
493 if (*Line=='#') {
494 enum ReadLogReturnCodeEnum status=ExtLog_Directive(Line);
495 if (status!=RLRC_Unknown) InExtLog=true;
496 return(status);
497 }
498 if (!InExtLog) return(RLRC_Unknown);
499
500 col=0;
501 while (*Line) {
502 if (col>=ExtColNumber) {
503 debuga(__FILE__,__LINE__,_("Too many columns in an extended log file format: %d columns found when %d have been announced\n"),col,ExtColNumber);
504 return(RLRC_Unknown);
505 }
506 col_id=ExtLog_WhichColumn(col);
507 switch (col_id)
508 {
509 case EXTCOL_Ip:
510 Entry->Ip=Ip=Line;
511 Line=ExtLog_GetString(Line,col,&IpEnd);
512 if (!Line) return(RLRC_Unknown);
513 break;
514 case EXTCOL_UserName:
515 Entry->User=Line;
516 Line=ExtLog_GetString(Line,col,&UserEnd);
517 if (!Line) return(RLRC_Unknown);
518 break;
519 case EXTCOL_Date:
520 Line=ExtLog_GetDate(Line,&Entry->EntryTime);
521 if (!Line) return(RLRC_Unknown);
522 break;
523 case EXTCOL_Time:
524 Line=ExtLog_GetTime(Line,&Entry->EntryTime);
525 if (!Line) return(RLRC_Unknown);
526 break;
527 case EXTCOL_TimeTaken:
528 Line=ExtLog_GetLongInt(Line,&Entry->ElapsedTime);
529 if (!Line) return(RLRC_Unknown);
530 break;
531 case EXTCOL_Bytes:
532 Line=ExtLog_GetLongLongInt(Line,&Entry->DataSize);
533 if (!Line) return(RLRC_Unknown);
534 break;
535 case EXTCOL_Uri:
536 Entry->Url=Line;
537 Line=ExtLog_GetString(Line,col,&UrlEnd);
538 if (!Line) return(RLRC_Unknown);
539 break;
540 case EXTCOL_Scheme:
541 UrlScheme=Line;
542 Line=ExtLog_GetString(Line,col,&UrlSchemeEnd);
543 if (!Line) return(RLRC_Unknown);
544 break;
545 case EXTCOL_Host:
546 UrlHost=Line;
547 Line=ExtLog_GetString(Line,col,&UrlHostEnd);
548 if (!Line) return(RLRC_Unknown);
549 break;
550 case EXTCOL_Port:
551 UrlPort=Line;
552 Line=ExtLog_GetString(Line,col,&UrlPortEnd);
553 if (!Line) return(RLRC_Unknown);
554 break;
555 case EXTCOL_Path:
556 UrlPath=Line;
557 Line=ExtLog_GetString(Line,col,&UrlPathEnd);
558 if (!Line) return(RLRC_Unknown);
559 break;
560 case EXTCOL_Query:
561 UrlQuery=Line;
562 Line=ExtLog_GetString(Line,col,&UrlQueryEnd);
563 if (!Line) return(RLRC_Unknown);
564 break;
565 case EXTCOL_Status:
566 Entry->HttpCode=Line;
567 Line=ExtLog_GetString(Line,col,&HttpCodeEnd);
568 if (!Line) return(RLRC_Unknown);
569 break;
570 case EXTCOL_Last://ignored column
571 Line=ExtLog_GetString(Line,col,NULL);
572 if (!Line) return(RLRC_Unknown);
573 break;
574 }
575 if (*Line && *Line!=ExtColSep[col]) return(RLRC_Unknown);
576 while (*Line && *Line==ExtColSep[col]) Line++;
577 col++;
578 }
579 if (col!=ExtColNumber) {
580 debuga(__FILE__,__LINE__,_("Only %d columns in an extended log file format when %d have been announced\n"),col,ExtColNumber);
581 return(RLRC_Unknown);
582 }
583
584 // check the entry time
585 if (mktime(&Entry->EntryTime)==-1) {
586 debuga(__FILE__,__LINE__,_("Invalid date or time found in the extended log file\n"));
587 return(RLRC_InternalError);
588 }
589
590 ExtLog_FixString(Ip,IpEnd);
591 ExtLog_FixString(Entry->User,UserEnd);
592 ExtLog_FixString(Entry->Url,UrlEnd);
593 ExtLog_FixString(Entry->HttpCode,HttpCodeEnd);
594 if (!Entry->Url)
595 {
596 ExtLog_FixString(UrlScheme,UrlSchemeEnd);
597 ExtLog_FixString(UrlHost,UrlHostEnd);
598 ExtLog_FixString(UrlPort,UrlPortEnd);
599 ExtLog_FixString(UrlPath,UrlPathEnd);
600 ExtLog_FixString(UrlQuery,UrlQueryEnd);
601 Entry->Url=ExtLog_ConcatUrl(UrlScheme,UrlHost,UrlPort,UrlPath,UrlQuery);
602 }
603
604 return(RLRC_NoError);
605 }
606
//! \brief Object to read an extended log.
//!
//! It lists, in this order, the name of the format, the function to call
//! when a new file is opened and the function to call to parse one line.
const struct ReadLogProcessStruct ReadExtLog=
{
	/* TRANSLATORS: This is the name of the log format displayed when this format is detected in an input log file. */
	N_("extended log format"),
	ExtLog_NewFile,
	ExtLog_ReadEntry
};