From: eldy <> Date: Sun, 2 Aug 2009 23:25:26 +0000 (+0000) Subject: Major update of databases. X-Git-Tag: AWSTATS_6_95_BETA~7 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d85e4d05ebd3d1d3c9f2ccf6486cdc85a629c9fe;p=thirdparty%2FAWStats.git Major update of databases. Increase seriously bot detection. --- diff --git a/wwwroot/cgi-bin/awstats.pl b/wwwroot/cgi-bin/awstats.pl index 37fd7ab5..a5a4f13c 100644 --- a/wwwroot/cgi-bin/awstats.pl +++ b/wwwroot/cgi-bin/awstats.pl @@ -11566,8 +11566,8 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' ) } } - # Analyze: Robot from "hit on robots.txt" file (=> countedtraffic=5 if robot) - # ------------------------------------------------------------------------- + # Analyze: Robot from "hit on robots.txt" file (=> countedtraffic=5 if robot) + # ------------------------------------------------------------------------- if ( !$countedtraffic ) { if ( $urlwithnoquery =~ /$regrobot/o ) { if ($Debug) { debug( " It's an unknown robot", 2 ); } @@ -11685,8 +11685,7 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' ) # Do DNS lookup #-------------- my $Host = $field[$pos_host]; - my $HostResolved = '' - ; # HostResolved will be defined in next paragraf if countedtraffic is true + my $HostResolved = ''; # HostResolved will be defined in next paragraf if countedtraffic is true if ( !$countedtraffic ) { my $ip = 0; @@ -11711,7 +11710,7 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' ) } elsif ( $DNSLookup == 1 ) { - # Check in session cache (dynamic DNS cache file + session DNS cache) + # Check in session cache (dynamic DNS cache file + session DNS cache) $HostResolved = $TmpDNSLookup{$Host}; if ( !$HostResolved ) { if ( @SkipDNSLookupFor && &SkipDNSLookup($Host) ) { diff --git a/wwwroot/cgi-bin/lib/browsers.pm b/wwwroot/cgi-bin/lib/browsers.pm index 749dc04d..982dcfe0 100644 --- a/wwwroot/cgi-bin/lib/browsers.pm +++ b/wwwroot/cgi-bin/lib/browsers.pm @@ -143,6 +143,7 @@ # RSS Readers 'abilon', 'aggrevator', +'aiderss', 'akregator', 'applesyndication', 'betanews_reader', @@ -192,13 +193,16 @@ 'ericsson', # Ericsson (must be after sonyericsson) 'mmef', 'mspie', +'vodafone', 'wapalizer', 'wapsilon', +'wap', # Generic WAP phone (must be after 'wap*') 'webcollage', 'up\.', # Works for UP.Browser and UP.Link +# PDA/Phonecell browsers 'blackberry', -# PDA/Phonecell I-Mode browsers 'docomo', +'iphone', 'portalmmm', # Others (TV) 'webtv', @@ -234,9 +238,6 @@ 'microsoft\-webdav\-miniredir', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav', -'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery', -'microsoft\soffice\sprotocol\sdiscovery', -'microsoft\soffice\sexistence\sdiscovery', 'POE\-Component\-Client\-HTTP', 'mozilla', # Must be at end because a lot of browsers contains mozilla in string 'libwww', # Must be at end because some browser have both 'browser id' and 'libwww' @@ -363,6 +364,7 @@ # RSS Readers 'abilon','Abilon (RSS Reader)', 'aggrevator', 'Aggrevator (RSS Reader)', +'aiderss', 'AideRSS (RSS Reader)', 'akregator','Akregator (RSS Reader)', 'applesyndication','AppleSyndication (RSS Reader)', 'betanews_reader','Betanews Reader (RSS Reader)', @@ -412,13 +414,16 @@ 'ericsson','Ericsson Browser (PDA/Phone browser)', # Must be after SonyEricsson 'mmef','Microsoft Mobile Explorer (PDA/Phone browser)', 'mspie','MS Pocket Internet Explorer (PDA/Phone browser)', +'vodafone','Vodaphone browser (PDA/Phone browser)', 'wapalizer','WAPalizer (PDA/Phone browser)', 'wapsilon','WAPsilon (PDA/Phone browser)', +'wap','Unknown WAP browser (PDA/Phone browser)', # Generic WAP phone (must be after 'wap*') 'webcollage','WebCollage (PDA/Phone browser)', 'up\.','UP.Browser (PDA/Phone browser)', # Works for UP.Browser and UP.Link +# PDA/Phonecell browsers 'blackberry','BlackBerry (PDA/Phone browser)', -# PDA/Phonecell I-Mode browsers 'docomo','I-Mode phone (PDA/Phone browser)', +'iphone','IPhone (PDA/Phone browser)', 'portalmmm','I-Mode phone (PDA/Phone browser)', # Others (TV) 'webtv','WebTV browser', @@ -454,9 +459,6 @@ 'microsoft\-webdav\-miniredir', 'Microsoft Data Access Component Internet Publishing Provider', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager', 'Microsoft Data Access Component Internet Publishing Provider Cache Manager', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav', 'Microsoft Data Access Component Internet Publishing Provider DAV', -'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery', 'Microsoft Data Access Component Internet Publishing Provider Protocol Discovery', -'microsoft\soffice\sprotocol\sdiscovery','Microsoft Office Protocol Discovery', -'microsoft\soffice\sexistence\sdiscovery','Microsoft Office Existence Discovery', 'POE\-Component\-Client\-HTTP','HTTP user-agent for POE (portable networking framework for Perl)', 'mozilla','Mozilla', 'libwww','LibWWW', @@ -572,36 +574,10 @@ 'xaudio','mediaplayer', 'xine','mediaplayer', 'xmms','mediaplayer', -# PDA/Phonecell browsers -'alcatel','pdaphone', # Alcatel -'lg\-','pdaphone', # LG -'ericsson','pdaphone', # Ericsson -'mot\-','pdaphone', # Motorola -'nokia','pdaphone', # Nokia -'panasonic','pdaphone', # Panasonic -'philips','pdaphone', # Philips -'sagem','pdaphone', # Sagem -'samsung','pdaphone', # Samsung -'sie\-','pdaphone', # SIE -'sec\-','pdaphone', # Sony/Ericsson -'sonyericsson','pdaphone', # Sony/Ericsson -'mmef','pdaphone', -'mspie','pdaphone', -'wapalizer','pdaphone', -'wapsilon','pdaphone', -'webcollage','pdaphone', -'up\.','pdaphone', # Works for UP.Browser and UP.Link -'blackberry','pdaphone', -# PDA/Phonecell I-Mode browsers -'docomo','pdaphone', -'portalmmm','pdaphone', -# Others (TV) -'webtv','webtv', -# Anonymous Proxy Browsers (can be used as grabbers as well...) -'cjb\.net','cjbnet', # RSS Readers 'abilon', 'abilon', 'aggrevator', 'rss', +'aiderss', 'rss', 'akregator', 'rss', 'applesyndication', 'rss', 'betanews_reader','rss', @@ -635,6 +611,36 @@ 'syndirella', 'rss', 'vienna', 'rss', 'wizz\srss\snews\sreader','wizz', +# PDA/Phonecell browsers +'alcatel','pdaphone', # Alcatel +'lg\-','pdaphone', # LG +'ericsson','pdaphone', # Ericsson +'mot\-','pdaphone', # Motorola +'nokia','pdaphone', # Nokia +'panasonic','pdaphone', # Panasonic +'philips','pdaphone', # Philips +'sagem','pdaphone', # Sagem +'samsung','pdaphone', # Samsung +'sie\-','pdaphone', # SIE +'sec\-','pdaphone', # Sony/Ericsson +'sonyericsson','pdaphone', # Sony/Ericsson +'mmef','pdaphone', +'mspie','pdaphone', +'vodafone','pdaphone', +'wapalizer','pdaphone', +'wapsilon','pdaphone', +'wap','pdaphone', # Generic WAP phone (must be after 'wap*') +'webcollage','pdaphone', +'up\.','pdaphone', # Works for UP.Browser and UP.Link +# PDA/Phonecell browsers +'blackberry','pdaphone', +'docomo','pdaphone', +'iphone','pdaphone', +'portalmmm','pdaphone', +# Others (TV) +'webtv','webtv', +# Anonymous Proxy Browsers (can be used as grabbers as well...) +'cjb\.net','cjbnet', # Other kind of browsers 'apt','apt', 'analogx_proxy','analogx', diff --git a/wwwroot/cgi-bin/lib/browsers_phone.pm b/wwwroot/cgi-bin/lib/browsers_phone.pm index 8576d187..1294fc87 100644 --- a/wwwroot/cgi-bin/lib/browsers_phone.pm +++ b/wwwroot/cgi-bin/lib/browsers_phone.pm @@ -117,7 +117,8 @@ 'xmms', # RSS Readers 'abilon', -'aggrevator', +'aggrevator', +'aiderss', 'akregator', 'applesyndication', 'betanews_reader', @@ -556,11 +557,15 @@ 'n21i', 'n22i', 'ts21i', -# PDA/Phonecell I-Mode browsers +'wap', # Generic WAP phone (must be after 'wap*') +'up\.', # Works for UP.Browser and UP.Link +# PDA/Phonecell browsers +'blackberry', +'cnf2', 'docomo', -'portalmmm', 'ipcheck', -'cnf2', +'iphone', +'portalmmm', # Others (TV) 'webtv', # Anonymous Proxy Browsers (can be used as grabbers as well...) @@ -589,7 +594,6 @@ 'microsoft\-webdav\-miniredir', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav', -'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery', 'POE\-Component\-Client\-HTTP', 'mozilla', # Must be at end because a lot of browsers contains mozilla in string 'libwww', # Must be at end because some browser have both 'browser id' and 'libwww' @@ -692,6 +696,7 @@ # RSS Readers 'abilon','Abilon (RSS Reader)', 'aggrevator', 'Aggrevator (RSS Reader)', +'aiderss', 'AideRSS (RSS Reader)', 'akregator','Akregator (RSS Reader)', 'applesyndication','AppleSyndication (RSS Reader)', 'betanews_reader','Betanews Reader (RSS Reader)', @@ -1130,11 +1135,15 @@ 'n21i','I-Mode Nec 21i (phone)', 'n22i','I-Mode Nec 22i (phone)', 'ts21i','I-Mode Toshiba 21i (phone)', -# PDA/Phonecell I-Mode browsers +'wap','Unknown WAP browser (PDA/Phone browser)', # Generic WAP phone (must be after 'wap*') +'up\.','UP.Browser (PDA/Phone browser)', # Works for UP.Browser and UP.Link +# PDA/Phonecell browsers +'blackberry','BlackBerry (PDA/Phone browser)', +'cnf2','Supervision I-Mode ByTel (phone)', 'docomo','I-Mode phone (PDA/Phone browser)', 'portalmmm','I-Mode phone (PDA/Phone browser)', 'ipcheck','Supervision IP Check (phone)', -'cnf2','Supervision I-Mode ByTel (phone)', +'iphone','IPhone (PDA/Phone browser)', # Others (TV) 'webtv','WebTV browser', # Anonymous Proxy Browsers (can be used as grabbers as well...) @@ -1163,7 +1172,6 @@ 'microsoft\-webdav\-miniredir', 'Microsoft Data Access Component Internet Publishing Provider', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager', 'Microsoft Data Access Component Internet Publishing Provider Cache Manager', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav', 'Microsoft Data Access Component Internet Publishing Provider DAV', -'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery', 'Microsoft Data Access Component Internet Publishing Provider Protocol Discovery', 'POE\-Component\-Client\-HTTP','HTTP user-agent for POE (portable networking framework for Perl)', 'mozilla','Mozilla', 'libwww','LibWWW', @@ -1263,16 +1271,10 @@ 'xaudio','mediaplayer', 'xine','mediaplayer', 'xmms','mediaplayer', -# PDA/Phonecell I-Mode browsers -'docomo','pdaphone', -'portalmmm','pdaphone', -# Others (TV) -'webtv','webtv', -# Anonymous Proxy Browsers (can be used as grabbers as well...) -'cjb\.net','cjbnet', # RSS Readers 'abilon', 'abilon', 'aggrevator', 'rss', +'aiderss', 'rss', 'akregator', 'rss', 'applesyndication', 'rss', 'betanews_reader','rss', @@ -1306,13 +1308,40 @@ 'syndirella', 'rss', 'vienna', 'rss', 'wizz\srss\snews\sreader','wizz', +# PDA/Phonecell browsers +#'alcatel','pdaphone', # Alcatel +#'lg\-','pdaphone', # LG +#'ericsson','pdaphone', # Ericsson +#'mot\-','pdaphone', # Motorola +#'nokia','pdaphone', # Nokia +#'panasonic','pdaphone', # Panasonic +#'philips','pdaphone', # Philips +#'sagem','pdaphone', # Sagem +#'samsung','pdaphone', # Samsung +#'sie\-','pdaphone', # SIE +#'sec\-','pdaphone', # Sony/Ericsson +#'sonyericsson','pdaphone', # Sony/Ericsson +#'mmef','pdaphone', +#'mspie','pdaphone', +#'wapalizer','pdaphone', +#'wapsilon','pdaphone', +'wap','pdaphone', # Generic WAP phone (must be after 'wap*') +'up\.','pdaphone', +# PDA/Phonecell browsers +'blackberry','pdaphone', +'docomo','pdaphone', +'iphone','pdaphone', +'portalmmm','pdaphone', +# Others (TV) +'webtv','webtv', +# Anonymous Proxy Browsers (can be used as grabbers as well...) +'cjb\.net','cjbnet', # Other kind of browsers 'apt','apt', 'analogx_proxy','analogx', 'microsoft\-webdav\-miniredir','frontpage', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager','frontpage', 'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav','frontpage', -'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery','frontpage', 'gnome\-vfs', 'gnome', 'neon','neon', 'javaws','java', diff --git a/wwwroot/cgi-bin/lib/operating_systems.pm b/wwwroot/cgi-bin/lib/operating_systems.pm index 5eaded27..300b3f03 100644 --- a/wwwroot/cgi-bin/lib/operating_systems.pm +++ b/wwwroot/cgi-bin/lib/operating_systems.pm @@ -88,13 +88,14 @@ 'palmos', 'syllable', # Miscellanous OS +'blackberry', 'cp/m', 'crayos', 'dreamcast', 'risc[_+ ]?os', 'symbian', 'webtv', -'playstation[_+ ]portable', +'playstation', 'xbox', 'wii', 'vienna', @@ -186,13 +187,14 @@ 'palmos','palmos', 'syllable','syllable', # Miscellanous OS +'blackberry','blackberry', 'cp/m','cp/m', 'crayos','crayos', 'dreamcast','dreamcast', 'risc[_+ ]?os','riscos', 'symbian','symbian', 'webtv','webtv', -'playstation[_+ ]portable', 'psp', +'playstation', 'psp', 'xbox', 'winxbox', 'wii', 'wii' ); @@ -271,13 +273,14 @@ 'palmos','Palm OS', 'syllable','Syllable', # Miscellanous OS +'blackberry','BlackBerry', 'cp/m','CP/M', 'crayos','CrayOS', 'dreamcast','Dreamcast', 'riscos','RISC OS', 'symbian','Symbian OS', 'webtv','WebTV', -'psp', 'Sony PlayStation Portable', +'psp', 'Sony PlayStation', 'wii', 'Nintendo Wii' ); diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index a30db477..e2cb4d5a 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -317,7 +317,6 @@ # Rem: To avoid bad detection, some robot's ids were removed from this list: # - Robots with ID of 3 letters only # - Robots called 'webs' and 'tcl' -# Rem: Some robots mostly used for downloading have also been removed, i.e. wget # Rem: directhit changed into direct_hit (its real id) # Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser # Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser @@ -648,6 +647,7 @@ 'wombat', 'wordpress', 'worm', +'woozweb', 'wwwc', 'wz101', 'xget', @@ -660,6 +660,7 @@ 'aipbot', 'aleadsoftbot', 'alpha_search_agent', +'allrati', 'aport', 'archive\.org_bot', 'argus', # Must be before nutch @@ -670,6 +671,7 @@ 'baiduspider', 'becomebot', 'bender', +'betabot', 'biglotron', 'bittorrent_bot', 'biz360[_+ ]spider', @@ -724,11 +726,13 @@ 'everest\-vulcan', 'ezresult', 'enteprise', +'facebook', 'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler 'fast_enterprise_crawler', 'fast\-search\-engine', +'favicon', 'favorg', 'favorites_sweeper', 'feedburner', @@ -882,6 +886,7 @@ 'sohu', # "sohu agent" 'snappy', 'sphere_scout', +'spip', 'sproose_crawler', 'steeler', 'steroid__download', @@ -959,15 +964,23 @@ 'ng\/1\.', # put at end to avoid false positive 'ng\/2\.', # put at end to avoid false positive 'exabot', # put at end to avoid false positive +# Other id that are 99% of robots +'wget', +'libwww', 'java\/[0-9]' # put at end to avoid false positive ); @RobotsSearchIDOrder_listgen = ( # Generic robot 'robot', +'checker', 'crawl', +'discovery', +'hunter', +'scanner', 'spider', -'bot[+:,\.\;\/\\\-]', -'[+:,\.\;\/\\\-]bot', +'sucker', +'bot[\s_+:,\.\;\/\\\-]', +'[\s_+:,\.\;\/\\\-]bot', 'no_user_agent' ); @@ -1292,6 +1305,7 @@ 'wombat','The Web Wombat', 'wordpress','WordPress', 'worm','The World Wide Web Worm', +'woozweb','Woozweb Monitoring', 'wwwc','WWWC Ver 0.2.5', 'wz101','WebZinger', 'xget','XGET', @@ -1304,6 +1318,7 @@ 'aipbot','aipbot', 'aleadsoftbot','ALeadSoftbot', 'alpha_search_agent','Alpha Search Agent', +'allrati','Allrati', 'aport', 'Aport', 'archive\.org_bot','archive.org bot', 'argus','Argus', @@ -1314,6 +1329,7 @@ 'baiduspider','BaiDuSpider', 'becomebot', 'BecomeBot', 'bender','bender focused_crawler', +'betabot','BetaBot', 'biglotron','Biglotron', 'bittorrent_bot','BitTorrent Bot', 'biz360[_+ ]spider','Biz360 spider', @@ -1367,11 +1383,13 @@ 'everest\-vulcan','Everest-Vulcan', 'ezresult', 'Ezresult', 'enteprise','Fast Enteprise Crawler', +'facebook','FaceBook bot', 'fast\-search\-engine','Fast-Search-Engine (not fastsearch.com)', 'fast_enterprise_crawler','FAST Enterprise Crawler', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * crawleradmin.t-info@telekom.de', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','Matrix S.p.A. - FAST Enterprise Crawler', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de', +'favicon','FavIconizer', 'favorg','FavOrg', 'favorites_sweeper','Favorites Sweeper', 'feedburner', 'Feedburner', @@ -1523,6 +1541,7 @@ 'sohu','sohu agent', 'snappy','Snappy', 'sphere_scout','Sphere Scout', +'spip','SPIP', 'sproose_crawler','sproose crawler', 'steroid__download','STEROID Download', 'steeler','Steeler', @@ -1600,13 +1619,21 @@ 'ng\/1\.','NG 1.x (Exalead)', # put at end to avoid false positive 'ng\/2\.','NG 2.x (Exalead)', # put at end to avoid false positive 'exabot','Exabot', # put at end to avoid false positive +# Other id that are 99% of robots +'wget','WGet tools', +'libwww','Perl tool', 'java\/[0-9]','Java (Often spam bot)', # put at end to avoid false positive -# Generic root ID +# Generic robot 'robot', 'Unknown robot (identified by \'robot\')', +'checker', 'Unknown robot (identified by \'checker\')', 'crawl', 'Unknown robot (identified by \'crawl\')', +'discovery', 'Unknown robot (identified by \'discovery\')', +'hunter', 'Unknown robot (identified by \'hunter\')', +'scanner', 'Unknown robot (identified by \'scanner\')', 'spider', 'Unknown robot (identified by \'spider\')', -'bot[+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')', -'[+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')', +'sucker', 'Unknown robot (identified by \'sucker\')', +'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')', +'[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')', 'no_user_agent','Unknown robot (identified by empty user agent string)', # Unknown robots identified by hit on robots.txt 'unknown', 'Unknown robot (identified by hit on \'robots.txt\')'