From 86d99d08c82bbf23d2ecf90adb36fbaba2d08af0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Marchal?= Date: Sun, 26 Aug 2012 15:46:51 +0200 Subject: [PATCH] Decode extended log formats Microsoft ISA produces such a log. This change is supposed to handle more general cases than the previous routine. The current code successfully decode the one line long log I have to test the code. The decoding procedure may not be compatible with *any* compliant extended log implementation. Sample logs are necessary to improve the code. --- readlog_extlog.c | 417 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 417 insertions(+) diff --git a/readlog_extlog.c b/readlog_extlog.c index 889a5a4..31d124d 100644 --- a/readlog_extlog.c +++ b/readlog_extlog.c @@ -27,11 +27,343 @@ #include "include/conf.h" #include "include/defs.h" +/*! +Maximum number of columns accepted in an extended log format. + +The current value is an arbitrary number chosen to have an +actual limit. +*/ +#define MAX_EXT_COLUMNS 250 + +enum ext_col_id { + EXTCOL_Ip, + EXTCOL_UserName, + EXTCOL_Date, + EXTCOL_Time, + EXTCOL_TimeTaken, + EXTCOL_Bytes, + EXTCOL_Uri, + EXTCOL_Status, + EXTCOL_Last //last entry of the list ! +}; + +//! \c True if the extended common long format is confirmed. +static bool InExtLog=false; +//! The index of relevant columns in the log file. +static int ExtCols[EXTCOL_Last]; +//! The character to use as a columns separator. +static char ExtColSep[MAX_EXT_COLUMNS]; +//! The number of columns according to the "fields" directive. +static int ExtColNumber; + /*! A new file is being read. The name of the file is \a FileName. */ static void ExtLog_NewFile(const char *FileName) { + InExtLog=false; + ExtColNumber=0; +} + +/*! +Parse the "Fields" directive listing the columns in the log. The +\a columns is a pointer to the first column of the directive. + +\return \c True if the fields is valid or false if it could not +be decoded. +*/ +static bool ExtLog_Fields(const char *columns) +{ + int col; + int len; + int prefix; + int header_start; + int header_end; + int i; + enum ext_col_id col_id; + char col_sep; + // see http://www.w3.org/TR/WD-logfile.html for the list of prefixes + const char const *prefixes[]= + { + "c", + "s", + "r", + "cs", + "sc", + "sr", + "rs", + "x", + }; + + for (i=0 ; i=MAX_EXT_COLUMNS) { + debuga(_("Too many columns found in an extended log format. The maximum allowed is %d but it can be changed if a bigger value is legitimate\n"),MAX_EXT_COLUMNS); + exit(EXIT_FAILURE); + } + prefix=-1; + header_start=-1; + header_end=-1; + for (i=sizeof(prefixes)/sizeof(*prefixes)-1 ; i>=0 ; i--) { + len=strlen(prefixes[i]); + if (strncasecmp(columns,prefixes[i],len)==0) { + if (columns[len]=='-') { + prefix=len++; + break; + } else if (columns[len]=='(') { + header_start=len++; + break; + } + } + } + if (i<0) len=0; + for ( ; (unsigned char)columns[len]>' ' ; len++) {//skip a word and accept any separator (tab or space) + if (header_start>=0 && columns[len]==')') header_end=len; + } + col_sep=columns[len]; + ExtColSep[col]=col_sep; + + // see http://www.w3.org/TR/WD-logfile.html for list of possible identifiers + col_id=EXTCOL_Last; + if (len==4) { + if (strncasecmp(columns,"c-ip",len)==0 && ExtCols[EXTCOL_Ip]<0) col_id=EXTCOL_Ip; + else if (strncasecmp(columns,"date",len)==0) col_id=EXTCOL_Date; + else if (strncasecmp(columns,"time",len)==0) col_id=EXTCOL_Time; + } else if (len==5) { + if (strncasecmp(columns,"c-dns",len)==0) col_id=EXTCOL_Ip; + } else if (len==6) { + if (strncasecmp(columns,"cs-uri",len)==0) col_id=EXTCOL_Uri; + } else if (len==8) { + if (strncasecmp(columns,"sc-bytes",len)==0) col_id=EXTCOL_Bytes; + } else if (len==9) { + if (strncasecmp(columns,"sc-status",len)==0) col_id=EXTCOL_Status; + } else if (len==10) { + if (strncasecmp(columns,"time-taken",len)==0) col_id=EXTCOL_TimeTaken; + } else if (len==11) { + if (strncasecmp(columns,"cs-username",len)==0) col_id=EXTCOL_UserName; + } + if (col_id!=EXTCOL_Last) { + ExtCols[col_id]=col; + } + + col++; + columns+=len; + while (*columns && (unsigned char)*columns<=' ') { + if (*columns!=col_sep) { + debuga(_("Multiple column separators found between two columns in the \"fields\" directive of an extended log format\n")); + exit(EXIT_FAILURE); + } + columns++; + } + } + ExtColNumber=col; + return(true); +} + +/*! +Decode a directive field from the \a Line. + +\return RLRC_Ignore if the line is a directive or RLRC_Unknown +if the line is not a known directive. +*/ +static enum ReadLogReturnCodeEnum ExtLog_Directive(const char *Line) +{ + ++Line; + if (strncasecmp(Line,"Version:",8)==0) return(RLRC_Ignore); + if (strncasecmp(Line,"Software:",9)==0) return(RLRC_Ignore); + if (strncasecmp(Line,"Start-Date:",11)==0) return(RLRC_Ignore); + if (strncasecmp(Line,"End-Date:",9)==0) return(RLRC_Ignore); + if (strncasecmp(Line,"Date:",5)==0) return(RLRC_Ignore); + if (strncasecmp(Line,"Remark:",7)==0) return(RLRC_Ignore); + if (strncasecmp(Line,"Fields:",7)==0) { + Line+=7; + while (*Line==' ' || *Line=='\t') Line++; + if (!ExtLog_Fields(Line)) return(RLRC_Unknown); + return(RLRC_Ignore); + } + return(RLRC_Unknown); +} + +/*! +Get the type of the column \a col_num. + +\return The type of the column or EXTCOL_Last if +the column must be ignored. +*/ +static enum ext_col_id ExtLog_WhichColumn(int col_num) +{ + int i; + + for (i=0 ; itm_year=year; + Date->tm_mon=month; + Date->tm_mday=day; + return(Line); +} + +/*! +Scan through the time in a column. + +\param Line The pointer to the beginning of the string. +*/ +static char *ExtLog_GetTime(char *Line,struct tm *Date) +{ + bool quote; + int hour; + int minute; + int second; + int next; + + //skip opening double quote + quote=(*Line=='\"'); + if (quote) ++Line; + if (sscanf(Line,"%d:%d:%d%n",&hour,&minute,&second,&next)!=3) return(NULL); + Line+=next; + if (quote) { + if (*Line!='\"') return(NULL);//missing closing quote. + ++Line; + } + Date->tm_hour=hour; + Date->tm_min=minute; + Date->tm_sec=second; + return(Line); +} + +/*! +Scan through a number in a column. + +\param Line The pointer to the beginning of the string. +\param Value A variable to store the number. +*/ +static char *ExtLog_GetLongInt(char *Line,long int *Value) +{ + bool quote; + + //skip opening double quote + quote=(*Line=='\"'); + if (quote) ++Line; + *Value=0; + while (isdigit(*Line)) *Value=*Value*10+(*Line++-'0'); + if (quote) { + if (*Line!='\"') return(NULL);//missing closing quote. + ++Line; + } + return(Line); +} + +/*! +Scan through a number in a column. + +\param Line The pointer to the beginning of the string. +\param Value A variable to store the number. +*/ +static char *ExtLog_GetLongLongInt(char *Line,long long int *Value) +{ + bool quote; + + //skip opening double quote + quote=(*Line=='\"'); + if (quote) ++Line; + *Value=0; + while (isdigit(*Line)) *Value=*Value*10+(*Line++-'0'); + if (quote) { + if (*Line!='\"') return(NULL);//missing closing quote. + ++Line; + } + return(Line); +} + +/*! +Remove the quotes inside the \a string. If no quotes are known to +be in the string, the \a end_ptr is the pointer to the last +character of the string. +*/ +static void ExtLog_FixString(char *string,char *end_ptr) +{ + char *dest; + + if (!string) return;//string not parsed + if (end_ptr) { //end is known and no quotes are in the string + *end_ptr='\0'; + return; + } + // remove the quotes and end at the first unremoveable quote + dest=string; + while (*string) + { + if (*string=='\"') { + if (string[1]!='\"') break; //closing quote + string++;//skip the first quote + } + *dest++=*string++; + } + *dest='\0'; } /*! @@ -46,6 +378,91 @@ Read one entry from an extended log. */ static enum ReadLogReturnCodeEnum ExtLog_ReadEntry(char *Line,struct ReadLogStruct *Entry) { + int col; + enum ext_col_id col_id; + char *IpEnd; + char *UserEnd; + char *UrlEnd; + char *HttpCodeEnd; + + // is it a directive + if (*Line=='#') { + enum ReadLogReturnCodeEnum status=ExtLog_Directive(Line); + if (status!=RLRC_Unknown) InExtLog=true; + return(status); + } + if (!InExtLog) return(RLRC_Unknown); + + col=0; + while (*Line) { + if (col>=ExtColNumber) { + debuga(_("Too many columns in an extended log file format: %d columns found when %d have been announced\n"),col,ExtColNumber); + return(RLRC_Unknown); + } + col_id=ExtLog_WhichColumn(col); + switch (col_id) + { + case EXTCOL_Ip: + Entry->Ip=Line; + Line=ExtLog_GetString(Line,col,&IpEnd); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_UserName: + Entry->User=Line; + Line=ExtLog_GetString(Line,col,&UserEnd); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_Date: + Line=ExtLog_GetDate(Line,&Entry->EntryTime); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_Time: + Line=ExtLog_GetTime(Line,&Entry->EntryTime); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_TimeTaken: + Line=ExtLog_GetLongInt(Line,&Entry->ElapsedTime); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_Bytes: + Line=ExtLog_GetLongLongInt(Line,&Entry->DataSize); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_Uri: + Entry->Url=Line; + Line=ExtLog_GetString(Line,col,&UrlEnd); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_Status: + Entry->HttpCode=Line; + Line=ExtLog_GetString(Line,col,&HttpCodeEnd); + if (!Line) return(RLRC_Unknown); + break; + case EXTCOL_Last://ignored column + Line=ExtLog_GetString(Line,col,NULL); + if (!Line) return(RLRC_Unknown); + break; + } + if (*Line && *Line!=ExtColSep[col]) return(RLRC_Unknown); + while (*Line && *Line==ExtColSep[col]) Line++; + col++; + } + if (col!=ExtColNumber) { + debuga(_("Only %d columns in an extended log file format when %d have been announced\n"),col,ExtColNumber); + return(RLRC_Unknown); + } + + // check the entry time + if (mktime(&Entry->EntryTime)==-1) { + debuga(_("Invalid date or time found in the extended log file\n")); + return(RLRC_InternalError); + } + + ExtLog_FixString(Entry->Ip,IpEnd); + ExtLog_FixString(Entry->User,UserEnd); + ExtLog_FixString(Entry->Url,UrlEnd); + ExtLog_FixString(Entry->HttpCode,HttpCodeEnd); + return(RLRC_NoError); } -- 2.39.5