From: Frédéric Marchal Date: Sat, 18 Jun 2011 10:31:49 +0000 (+0000) Subject: Alias host names in URL and group identical names X-Git-Tag: v2.3.2~56 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=227153528d900c4b3d213349e91bd1198f164a75;p=thirdparty%2Fsarg.git Alias host names in URL and group identical names The user can write a file providing rules to replace the host names extracted from the URL and displayed in the reports. The rules allow for one wildcard in the host names to be matched. Identical aliased host named are grouped together in the reports. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ab0964..1a77f8e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ PROJECT(sarg C) SET(sarg_VERSION 2) SET(sarg_REVISION "3.2-pre1") SET(sarg_BUILD "") -SET(sarg_BUILDDATE "Apr-24-2011") +SET(sarg_BUILDDATE "Jun-18-2011") INCLUDE(AddFileDependencies) INCLUDE(CheckIncludeFile) @@ -53,7 +53,7 @@ SET(SRC util.c log.c report.c topuser.c email.c sort.c html.c smartfilter.c denied.c authfail.c charset.c squidguard_log.c squidguard_report.c auth.c download.c grepday.c dansguardian_log.c dansguardian_report.c realtime.c btree_cache.c - usertab.c userinfo.c longline.c) + usertab.c userinfo.c longline.c url.c) FOREACH(f ${SRC}) ADD_FILE_DEPENDENCIES(${f} ${CMAKE_BINARY_DIR}/config.h ${CMAKE_SOURCE_DIR}/include/conf.h ${CMAKE_SOURCE_DIR}/include/info.h ${CMAKE_SOURCE_DIR}/include/defs.h) diff --git a/ChangeLog b/ChangeLog index bf1c37d..acea3ab 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,6 @@ SARG ChangeLog -Apr-24-2011 Version 2.3.2-pre1 +Jun-18-2011 Version 2.3.2-pre1 - Add support for sorttable.js (http://www.kryogenix.org/code/browser/sorttable/) to dynamically sort some tables (thanks to Éric). - Add the two command line options --lastlog and --keeplogs to set the number of reports to keep or to keep all the reports respectively (thanks to Emmanuel Lacour for the suggestion). - Report the user ID in the e-mail report. @@ -13,6 +13,7 @@ Apr-24-2011 Version 2.3.2-pre1 - Don't fail for an empty report directory when building the index. - Fix a read error when parsing the time of a common log format. - Use anonymous file and directory names in the report to hide the identity of the user whose report is displayed and shorten the total path length. + - Replace host names by alias and group identical aliased host names. Sep-18-2010 Version 2.3.1 - Remove the distinct printf for the alpha architecture as it doesn't work anymore and is not necessary anyway. diff --git a/Makefile.in b/Makefile.in index a38e28f..ef341fa 100644 --- a/Makefile.in +++ b/Makefile.in @@ -38,7 +38,7 @@ SRCS = util.c log.c report.c topuser.c email.c sort.c html.c \ smartfilter.c denied.c authfail.c charset.c \ squidguard_log.c squidguard_report.c auth.c download.c grepday.c \ dansguardian_log.c dansguardian_report.c realtime.c btree_cache.c \ - usertab.c userinfo.c longline.c + usertab.c userinfo.c longline.c url.c OBJS = $(SRCS:.c=.o) diff --git a/documentation/util.txt b/documentation/util.txt index 73683f6..508d36b 100644 --- a/documentation/util.txt +++ b/documentation/util.txt @@ -755,22 +755,6 @@ Close the header opened by write_html_header(). -/*! \fn void url_hostname(const char *url,char *hostname,int hostsize) -Extract the host name from the URL. - -\param url The url whose host name must be extracted. -\param hostname The buffer to store the host name. -\param hostsize The size of the host name buffer. - -\note The function is stupid at this time. It just searches for the first slash -in the URL and truncates the URL there. It doesn't take the protocol into account -nor the port number nor any user or password information. -*/ - - - - - /*! \fn void url_module(const char *url, char *w2) Copy at most 254 bytes from the end of the URL or stops at the first /. diff --git a/getconf.c b/getconf.c index f26cc80..549d6ff 100644 --- a/getconf.c +++ b/getconf.c @@ -738,6 +738,8 @@ static void parmtest(char *buf) if (getparam_string("sorttable",buf,SortTableJs,sizeof(SortTableJs))>0) return; + if (getparam_string("hostalias",buf,HostAliasFile,sizeof(HostAliasFile))>0) return; + if(strstr(buf,"squid24") != 0) { squid24=true; return; diff --git a/include/conf.h b/include/conf.h index 3ed4a95..c138501 100755 --- a/include/conf.h +++ b/include/conf.h @@ -427,6 +427,8 @@ char LDAPTargetAttr[64]; char GraphFont[MAXLEN]; //! The full path to sorttable.js if the table in the reports must be dynamicaly sorted. char SortTableJs[256]; +//! The name of the file containing the host names to replace by an alias in the report. +char HostAliasFile[512]; int idate; int smartfilter; diff --git a/include/defs.h b/include/defs.h index 20ba906..4fdd093 100755 --- a/include/defs.h +++ b/include/defs.h @@ -190,6 +190,12 @@ void topuser(void); // totday.c void day_totalize(const char *tmp, const struct userinfostruct *uinfo); +// url.c +void read_hostalias(const char *Filename); +void free_hostalias(void); +const char *process_url(char *url); +void url_hostname(const char *url,char *hostname,int hostsize); + // usage.c void usage(const char *prog); @@ -245,7 +251,6 @@ void debuga(const char *msg,...) __attribute__((format(printf,1,2))); void debugaz(const char *head, const char *msg); void my_lltoa(unsigned long long int n, char *s, int ssize, int len); char *get_size(const char *path, const char *file); -void url_hostname(const char *url,char *hostname,int hostsize); void url_module(const char *url, char *w2); void url_to_file(const char *url,char *file,int filesize); void strip_latin(char *line); diff --git a/include/info.h b/include/info.h index dfc5b68..08d173c 100755 --- a/include/info.h +++ b/include/info.h @@ -1,3 +1,3 @@ -#define VERSION PACKAGE_VERSION" Apr-24-2011" +#define VERSION PACKAGE_VERSION" Jun-18-2011" #define PGM PACKAGE_NAME #define URL "http://sarg.sourceforge.net" diff --git a/log.c b/log.c index 6d276f9..4e5d7a1 100644 --- a/log.c +++ b/log.c @@ -110,8 +110,8 @@ int main(int argc,char *argv[]) char start_hour[128]; char end_hour[128]; char *linebuf; - char hostname[512]; - char *url; + const char *url; + char *full_url; char *urly; char user[MAX_USER_LEN]; char splitprefix[MAXLEN]; @@ -326,6 +326,7 @@ int main(int argc,char *argv[]) start_hour[0]='\0'; end_hour[0]='\0'; hm_str[0]='\0'; + HostAliasFile[0]='\0'; denied_count=0; download_count=0; @@ -615,6 +616,8 @@ int main(int argc,char *argv[]) getuexclude(ExcludeUsers,debug); fuser=true; } + if (HostAliasFile[0] != '\0') + read_hostalias(HostAliasFile); indexonly=false; if(fuser) { @@ -948,7 +951,7 @@ int main(int argc,char *argv[]) debuga(_("Maybe you have a broken record or garbage in your %s file\n"),arq); exit(EXIT_FAILURE); } - if (getword_ptr(linebuf,&url,&gwarea,' ')<0) { + if (getword_ptr(linebuf,&full_url,&gwarea,' ')<0) { debuga(_("Maybe you have a broken url in your %s file\n"),arq); exit(EXIT_FAILURE); } @@ -1046,7 +1049,7 @@ int main(int argc,char *argv[]) debuga(_("Maybe you have a broken request method in your %s file\n"),arq); exit(EXIT_FAILURE); } - if (getword_ptr(linebuf,&url,&gwarea,' ')<0){ + if (getword_ptr(linebuf,&full_url,&gwarea,' ')<0){ debuga(_("Maybe you have a broken url in your %s file\n"),arq); exit(EXIT_FAILURE); } @@ -1087,7 +1090,7 @@ int main(int argc,char *argv[]) debuga(_("Maybe you have a broken record or garbage in your %s file\n"),arq); exit(EXIT_FAILURE); } - if (getword_ptr(linebuf,&url,&gwarea,'\t')<0){ + if (getword_ptr(linebuf,&full_url,&gwarea,'\t')<0){ debuga(_("Maybe you have a broken record or garbage in your %s file\n"),arq); exit(EXIT_FAILURE); } @@ -1208,7 +1211,7 @@ int main(int argc,char *argv[]) } strcpy(tam,str); } else if (x==isa_cols[ISACOL_Uri]) { - url=str; + full_url=str; } else if (x==isa_cols[ISACOL_Status]) { if (strlen(str)>=sizeof(code)) { debuga(_("Maybe you have a broken access code in your %s file\n"),arq); @@ -1320,37 +1323,23 @@ int main(int argc,char *argv[]) } #endif - urly=url; + urly=full_url; if(ilf!=ILF_Sarg) { /* The full URL is not saved in sarg log. There is no point in testing the URL to detect a downloaded file. */ - download_flag=is_download_suffix(url); + download_flag=is_download_suffix(full_url); if (download_flag) { - download_url=url; + download_url=full_url; download_count++; } } else download_flag=false; - // remove any protocol:// at the beginning of the URL - if ((str = strchr(url,'/')) != NULL && str[1] == '/') { - int i; - - str+=2; - for (i=0 ; str[i] ; i++) - url[i]=str[i]; - url[i]='\0'; - } - - if(!LongUrl) { - url_hostname(url,hostname,sizeof(hostname)); - url=hostname; - } - - if(url[0] == '\0') continue; + url=process_url(full_url); + if (!url || url[0] == '\0') continue; if(addr[0] != '\0'){ if(strcmp(addr,ip)!=0) continue; @@ -1582,6 +1571,7 @@ int main(int argc,char *argv[]) free_download(); free_excludecodes(); free_exclude(); + free_hostalias(); if(debug) { int totalcount=0; diff --git a/sarg.conf b/sarg.conf index 3c29f12..dcfda71 100644 --- a/sarg.conf +++ b/sarg.conf @@ -728,3 +728,26 @@ # for the implementation on which sarg is based. # # sorttable /sorttable.js + +# TAG: hostalias +# The name of a text file containing the host names one per line and the +# optional alias to use in the report instead of that host name. +# Host names may contain up to one wildcard denoted by a *. The wildcard +# must not end the host name. +# The host name may be followed by an optional alias but if no alias is +# provided, the host name, including the wildcard, replaces any matching +# host name found in the log. +# Host names replaced by identical aliases are grouped together in the +# reports. +# IP addresses are processed as string. There is no support for CIDR +# notation. +# +# Example: +# *.gstatic.com +# mt*.google.com +# *.myphone.microsoft.com +# *.myphone.microsoft.com:443 *.myphone.microsoft.com:secure +# *.freeav.net antivirus:freeav +# *.mail.live.com +# 65.55.33.119 *.mail.live.com +#hostalias /usr/local/sarg/hostalias diff --git a/url.c b/url.c new file mode 100644 index 0000000..43df14d --- /dev/null +++ b/url.c @@ -0,0 +1,308 @@ +/* + * SARG Squid Analysis Report Generator http://sarg.sourceforge.net + * 1998, 2011 + * + * SARG donations: + * please look at http://sarg.sourceforge.net/donations.php + * Support: + * http://sourceforge.net/projects/sarg/forums/forum/363374 + * --------------------------------------------------------------------- + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. + * + */ + +#include "include/conf.h" +#include "include/defs.h" + +/*! +A host name and the name to report. +*/ +struct hostalias +{ + //! The next host name in the list or NULL for the last item. + struct hostalias *Next; + //! The minimum length of a candidate host name. + int MinLen; + //! The length of the constant part at the beginning of the mask. + int PrefixLen; + //! The length of the constant part at the end of the mask. + int SuffixLen; + //! The first part of the mask of the host name. + const char *HostName_Prefix; + //! The second part of the mask of the host name. + const char *HostName_Suffix; + //! The replacement name. + const char *Alias; +}; + +//! The first host name. +struct hostalias *FirstAlias=NULL; + +/*! +Read the file containing the host names to alias in the report. + +\param Filename The name of the file. +*/ +void read_hostalias(const char *Filename) +{ + FILE *fi; + longline line; + char *buf; + char *str; + char *NameBegin; + char *NameEnd; + char *Replace; + struct hostalias *alias; + struct hostalias *prev_alias; + struct hostalias *new_alias; + int cmp; + + if (debug) debuga(_("Reading host alias file \"%s\"\n"),Filename); + fi=fopen(Filename,"rt"); + if (!fi) { + debuga(_("Cannot read host name alias file \"%s\" - %s\n"),Filename,strerror(errno)); + exit(EXIT_FAILURE); + } + + if ((line=longline_create())==NULL) { + debuga(_("Not enough memory to read the host name aliases\n")); + exit(EXIT_FAILURE); + } + + prev_alias=NULL; + while ((buf=longline_read(fi,line)) != NULL) { + // get host name and split at the first wildcards + NameBegin=buf; + while (*NameBegin==' ' || *NameBegin=='\t') NameBegin++; + if ((unsigned char)*NameBegin<' ' || *NameBegin=='#' || *NameBegin==';') continue; + for (str=NameBegin ; *str && (unsigned char)*str>' ' && *str!='*' ; str++) + *str=tolower(*str); + if (*str=='*') { + *str++='\0'; + NameEnd=str; + while (*str && (unsigned char)*str>' ') { + if (*str=='*') { + debuga(_("Host name alias \"%s*%s\" contains too many wildcards (*) in \"%s\""),NameBegin,NameEnd,Filename); + exit(EXIT_FAILURE); + } + *str=tolower(*str); + str++; + } + } else + NameEnd=NULL; + while (*str && (unsigned char)*str<=' ') *str++='\0'; + if (NameEnd && NameEnd[0]=='\0') { + debuga(_("Host name alias \"%s\" must not end with a wildcard"),NameEnd); + exit(EXIT_FAILURE); + } + if (NameBegin[0]=='\0') NameBegin=NULL; + if (!NameBegin && !NameEnd) continue; + + // get the alias + Replace=str; + while (*Replace==' ' || *Replace=='\t') Replace++; + if ((unsigned char)*Replace<' ') { + Replace=NULL; + } else { + for (str=Replace ; *str && (unsigned char)*str>=' ' ; str++); + *str='\0'; + } + + // ignore duplicates + cmp=1; + for (alias=FirstAlias ; alias ; alias=alias->Next) { + if (((NameBegin && alias->HostName_Prefix && !strcmp(NameBegin,alias->HostName_Prefix)) || (!NameBegin && !alias->HostName_Prefix)) && + ((NameEnd && alias->HostName_Suffix && !strcmp(NameEnd,alias->HostName_Suffix)) || (!NameEnd && !alias->HostName_Suffix))) { + cmp=0; + break; + } + } + if (!cmp) continue; + + // insert into the list + new_alias=malloc(sizeof(*new_alias)); + if (!new_alias) { + debuga(_("Not enough memory to store the host name aliasing directives read in \"%s\""),Filename); + exit(EXIT_FAILURE); + } + new_alias->MinLen=0; + if (NameBegin) { + new_alias->HostName_Prefix=strdup(NameBegin); + if (!new_alias->HostName_Prefix) { + debuga(_("Not enough memory to store the host name aliasing directives read in \"%s\""),Filename); + exit(EXIT_FAILURE); + } + new_alias->MinLen+=strlen(NameBegin); + new_alias->PrefixLen=strlen(NameBegin); + } else { + new_alias->HostName_Prefix=NULL; + new_alias->PrefixLen=0; + } + if (NameEnd) { + new_alias->HostName_Suffix=strdup(NameEnd); + if (!new_alias->HostName_Suffix) { + debuga(_("Not enough memory to store the host name aliasing directives read in \"%s\""),Filename); + exit(EXIT_FAILURE); + } + new_alias->MinLen+=strlen(NameEnd)+1; + new_alias->SuffixLen=strlen(NameEnd); + } else { + new_alias->HostName_Suffix=NULL; + new_alias->SuffixLen=0; + } + if (Replace) { + new_alias->Alias=strdup(Replace); + if (!new_alias->Alias) { + debuga(_("Not enough memory to store the host name aliasing directives read in \"%s\""),Filename); + exit(EXIT_FAILURE); + } + } else { + char *tmp; + tmp=malloc(new_alias->MinLen); + if (!tmp) { + debuga(_("Not enough memory to store the host name aliasing directives read in \"%s\""),Filename); + exit(EXIT_FAILURE); + } + if (new_alias->HostName_Prefix) strcpy(tmp,new_alias->HostName_Prefix); + if (new_alias->HostName_Suffix) { + tmp[new_alias->PrefixLen]='*'; + strcpy(tmp+new_alias->PrefixLen+1,new_alias->HostName_Suffix); + } + new_alias->Alias=tmp; + } + + new_alias->Next=NULL; + if (prev_alias) + prev_alias->Next=new_alias; + else + FirstAlias=new_alias; + prev_alias=new_alias; + } + + longline_destroy(&line); + fclose(fi); + + if (debug) { + debuga(_("List of host names to alias:\n")); + for (alias=FirstAlias ; alias ; alias=alias->Next) { + if (alias->HostName_Prefix && alias->HostName_Suffix) + debuga(_(" %s*%s => %s\n"),alias->HostName_Prefix,alias->HostName_Suffix,alias->Alias); + else if (alias->HostName_Prefix) + debuga(_(" %s => %s\n"),alias->HostName_Prefix,alias->Alias); + else + debuga(_(" *%s => %s\n"),alias->HostName_Suffix,alias->Alias); + } + } +} + +/*! +Free the memory allocated by read_hostalias(). +*/ +void free_hostalias(void) +{ + struct hostalias *alias; + struct hostalias *next; + + for (alias=FirstAlias ; alias ; alias=next) { + next=alias->Next; + if (alias->HostName_Prefix) free((void *)alias->HostName_Prefix); + if (alias->HostName_Suffix) free((void *)alias->HostName_Suffix); + free((void *)alias->Alias); + free(alias); + } +} + +/*! +Replace the host name by its alias if it is in our list. + +\param host The host name. + +\return The pointer to the host name or its alias. +*/ +const char *alias_url(const char *url) +{ + struct hostalias *alias; + int len; + + len=strlen(url); + for (alias=FirstAlias ; alias ; alias=alias->Next) { + if (lenMinLen) continue; + if (alias->HostName_Prefix) { + if (alias->HostName_Suffix) { + if (strncasecmp(url,alias->HostName_Prefix,alias->PrefixLen)==0 && + strcasecmp(url+(len-alias->SuffixLen),alias->HostName_Suffix)==0) { + return(alias->Alias); + } + } else { + if (len==alias->PrefixLen && strcasecmp(url,alias->HostName_Prefix)==0) { + return(alias->Alias); + } + } + } else if (strcasecmp(url+(len-alias->SuffixLen),alias->HostName_Suffix)==0) { + return(alias->Alias); + } + } + return(url); +} + +/*! +Get the part of the URL necessary to generate the report. + +\param url The URL as extracted from the report. +*/ +const char *process_url(char *url) +{ + char *str; + const char *start; + + // remove any scheme:// at the beginning of the URL (see rfc2396 section 3.1) + for (str=url ; *str && (isalnum(*str) || *str=='+' || *str=='-' || *str=='.') ; str++); + if (*str==':' && str[1]=='/') { + url=str+1; + while (*url=='/') url++; + } + + start=url; + if (!LongUrl) { + for (str=url ; *str && *str!='/' ; str++); + if (*str=='/') *str='\0'; + if (FirstAlias) + start=alias_url(start); + } + return(start); +} + +/*! +Extract the host name from the URL. + +\param url The url whose host name must be extracted. +\param hostname The buffer to store the host name. +\param hostsize The size of the host name buffer. + +\note The function is stupid at this time. It just searches for the first slash +in the URL and truncates the URL there. It doesn't take the protocol into account +nor the port number nor any user or password information. +*/ +void url_hostname(const char *url,char *hostname,int hostsize) +{ + int i; + + hostsize--; + for (i=0 ; i