From: eldy <>
Date: Sun, 2 Aug 2009 23:25:26 +0000 (+0000)
Subject: Major update of databases.
X-Git-Tag: AWSTATS_6_95_BETA~7
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d85e4d05ebd3d1d3c9f2ccf6486cdc85a629c9fe;p=thirdparty%2FAWStats.git
Major update of databases.
Significantly improve bot detection.
---
diff --git a/wwwroot/cgi-bin/awstats.pl b/wwwroot/cgi-bin/awstats.pl
index 37fd7ab5..a5a4f13c 100644
--- a/wwwroot/cgi-bin/awstats.pl
+++ b/wwwroot/cgi-bin/awstats.pl
@@ -11566,8 +11566,8 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' )
}
}
- # Analyze: Robot from "hit on robots.txt" file (=> countedtraffic=5 if robot)
- # -------------------------------------------------------------------------
+ # Analyze: Robot from "hit on robots.txt" file (=> countedtraffic=5 if robot)
+ # -------------------------------------------------------------------------
if ( !$countedtraffic ) {
if ( $urlwithnoquery =~ /$regrobot/o ) {
if ($Debug) { debug( " It's an unknown robot", 2 ); }
@@ -11685,8 +11685,7 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' )
# Do DNS lookup
#--------------
my $Host = $field[$pos_host];
- my $HostResolved = ''
- ; # HostResolved will be defined in next paragraf if countedtraffic is true
+ my $HostResolved = ''; # HostResolved will be defined in next paragraph if countedtraffic is true
if ( !$countedtraffic ) {
my $ip = 0;
@@ -11711,7 +11710,7 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' )
}
elsif ( $DNSLookup == 1 ) {
- # Check in session cache (dynamic DNS cache file + session DNS cache)
+ # Check in session cache (dynamic DNS cache file + session DNS cache)
$HostResolved = $TmpDNSLookup{$Host};
if ( !$HostResolved ) {
if ( @SkipDNSLookupFor && &SkipDNSLookup($Host) ) {
diff --git a/wwwroot/cgi-bin/lib/browsers.pm b/wwwroot/cgi-bin/lib/browsers.pm
index 749dc04d..982dcfe0 100644
--- a/wwwroot/cgi-bin/lib/browsers.pm
+++ b/wwwroot/cgi-bin/lib/browsers.pm
@@ -143,6 +143,7 @@
# RSS Readers
'abilon',
'aggrevator',
+'aiderss',
'akregator',
'applesyndication',
'betanews_reader',
@@ -192,13 +193,16 @@
'ericsson', # Ericsson (must be after sonyericsson)
'mmef',
'mspie',
+'vodafone',
'wapalizer',
'wapsilon',
+'wap', # Generic WAP phone (must be after 'wap*')
'webcollage',
'up\.', # Works for UP.Browser and UP.Link
+# PDA/Phonecell browsers
'blackberry',
-# PDA/Phonecell I-Mode browsers
'docomo',
+'iphone',
'portalmmm',
# Others (TV)
'webtv',
@@ -234,9 +238,6 @@
'microsoft\-webdav\-miniredir',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav',
-'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery',
-'microsoft\soffice\sprotocol\sdiscovery',
-'microsoft\soffice\sexistence\sdiscovery',
'POE\-Component\-Client\-HTTP',
'mozilla', # Must be at end because a lot of browsers contains mozilla in string
'libwww', # Must be at end because some browser have both 'browser id' and 'libwww'
@@ -363,6 +364,7 @@
# RSS Readers
'abilon','Abilon (RSS Reader)',
'aggrevator', 'Aggrevator (RSS Reader)',
+'aiderss', 'AideRSS (RSS Reader)',
'akregator','Akregator (RSS Reader)',
'applesyndication','AppleSyndication (RSS Reader)',
'betanews_reader','Betanews Reader (RSS Reader)',
@@ -412,13 +414,16 @@
'ericsson','Ericsson Browser (PDA/Phone browser)', # Must be after SonyEricsson
'mmef','Microsoft Mobile Explorer (PDA/Phone browser)',
'mspie','MS Pocket Internet Explorer (PDA/Phone browser)',
+'vodafone','Vodaphone browser (PDA/Phone browser)',
'wapalizer','WAPalizer (PDA/Phone browser)',
'wapsilon','WAPsilon (PDA/Phone browser)',
+'wap','Unknown WAP browser (PDA/Phone browser)', # Generic WAP phone (must be after 'wap*')
'webcollage','WebCollage (PDA/Phone browser)',
'up\.','UP.Browser (PDA/Phone browser)', # Works for UP.Browser and UP.Link
+# PDA/Phonecell browsers
'blackberry','BlackBerry (PDA/Phone browser)',
-# PDA/Phonecell I-Mode browsers
'docomo','I-Mode phone (PDA/Phone browser)',
+'iphone','IPhone (PDA/Phone browser)',
'portalmmm','I-Mode phone (PDA/Phone browser)',
# Others (TV)
'webtv','WebTV browser',
@@ -454,9 +459,6 @@
'microsoft\-webdav\-miniredir', 'Microsoft Data Access Component Internet Publishing Provider',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager', 'Microsoft Data Access Component Internet Publishing Provider Cache Manager',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav', 'Microsoft Data Access Component Internet Publishing Provider DAV',
-'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery', 'Microsoft Data Access Component Internet Publishing Provider Protocol Discovery',
-'microsoft\soffice\sprotocol\sdiscovery','Microsoft Office Protocol Discovery',
-'microsoft\soffice\sexistence\sdiscovery','Microsoft Office Existence Discovery',
'POE\-Component\-Client\-HTTP','HTTP user-agent for POE (portable networking framework for Perl)',
'mozilla','Mozilla',
'libwww','LibWWW',
@@ -572,36 +574,10 @@
'xaudio','mediaplayer',
'xine','mediaplayer',
'xmms','mediaplayer',
-# PDA/Phonecell browsers
-'alcatel','pdaphone', # Alcatel
-'lg\-','pdaphone', # LG
-'ericsson','pdaphone', # Ericsson
-'mot\-','pdaphone', # Motorola
-'nokia','pdaphone', # Nokia
-'panasonic','pdaphone', # Panasonic
-'philips','pdaphone', # Philips
-'sagem','pdaphone', # Sagem
-'samsung','pdaphone', # Samsung
-'sie\-','pdaphone', # SIE
-'sec\-','pdaphone', # Sony/Ericsson
-'sonyericsson','pdaphone', # Sony/Ericsson
-'mmef','pdaphone',
-'mspie','pdaphone',
-'wapalizer','pdaphone',
-'wapsilon','pdaphone',
-'webcollage','pdaphone',
-'up\.','pdaphone', # Works for UP.Browser and UP.Link
-'blackberry','pdaphone',
-# PDA/Phonecell I-Mode browsers
-'docomo','pdaphone',
-'portalmmm','pdaphone',
-# Others (TV)
-'webtv','webtv',
-# Anonymous Proxy Browsers (can be used as grabbers as well...)
-'cjb\.net','cjbnet',
# RSS Readers
'abilon', 'abilon',
'aggrevator', 'rss',
+'aiderss', 'rss',
'akregator', 'rss',
'applesyndication', 'rss',
'betanews_reader','rss',
@@ -635,6 +611,36 @@
'syndirella', 'rss',
'vienna', 'rss',
'wizz\srss\snews\sreader','wizz',
+# PDA/Phonecell browsers
+'alcatel','pdaphone', # Alcatel
+'lg\-','pdaphone', # LG
+'ericsson','pdaphone', # Ericsson
+'mot\-','pdaphone', # Motorola
+'nokia','pdaphone', # Nokia
+'panasonic','pdaphone', # Panasonic
+'philips','pdaphone', # Philips
+'sagem','pdaphone', # Sagem
+'samsung','pdaphone', # Samsung
+'sie\-','pdaphone', # SIE
+'sec\-','pdaphone', # Sony/Ericsson
+'sonyericsson','pdaphone', # Sony/Ericsson
+'mmef','pdaphone',
+'mspie','pdaphone',
+'vodafone','pdaphone',
+'wapalizer','pdaphone',
+'wapsilon','pdaphone',
+'wap','pdaphone', # Generic WAP phone (must be after 'wap*')
+'webcollage','pdaphone',
+'up\.','pdaphone', # Works for UP.Browser and UP.Link
+# PDA/Phonecell browsers
+'blackberry','pdaphone',
+'docomo','pdaphone',
+'iphone','pdaphone',
+'portalmmm','pdaphone',
+# Others (TV)
+'webtv','webtv',
+# Anonymous Proxy Browsers (can be used as grabbers as well...)
+'cjb\.net','cjbnet',
# Other kind of browsers
'apt','apt',
'analogx_proxy','analogx',
diff --git a/wwwroot/cgi-bin/lib/browsers_phone.pm b/wwwroot/cgi-bin/lib/browsers_phone.pm
index 8576d187..1294fc87 100644
--- a/wwwroot/cgi-bin/lib/browsers_phone.pm
+++ b/wwwroot/cgi-bin/lib/browsers_phone.pm
@@ -117,7 +117,8 @@
'xmms',
# RSS Readers
'abilon',
-'aggrevator',
+'aggrevator',
+'aiderss',
'akregator',
'applesyndication',
'betanews_reader',
@@ -556,11 +557,15 @@
'n21i',
'n22i',
'ts21i',
-# PDA/Phonecell I-Mode browsers
+'wap', # Generic WAP phone (must be after 'wap*')
+'up\.', # Works for UP.Browser and UP.Link
+# PDA/Phonecell browsers
+'blackberry',
+'cnf2',
'docomo',
-'portalmmm',
'ipcheck',
-'cnf2',
+'iphone',
+'portalmmm',
# Others (TV)
'webtv',
# Anonymous Proxy Browsers (can be used as grabbers as well...)
@@ -589,7 +594,6 @@
'microsoft\-webdav\-miniredir',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav',
-'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery',
'POE\-Component\-Client\-HTTP',
'mozilla', # Must be at end because a lot of browsers contains mozilla in string
'libwww', # Must be at end because some browser have both 'browser id' and 'libwww'
@@ -692,6 +696,7 @@
# RSS Readers
'abilon','Abilon (RSS Reader)',
'aggrevator', 'Aggrevator (RSS Reader)',
+'aiderss', 'AideRSS (RSS Reader)',
'akregator','Akregator (RSS Reader)',
'applesyndication','AppleSyndication (RSS Reader)',
'betanews_reader','Betanews Reader (RSS Reader)',
@@ -1130,11 +1135,15 @@
'n21i','I-Mode Nec 21i (phone)',
'n22i','I-Mode Nec 22i (phone)',
'ts21i','I-Mode Toshiba 21i (phone)',
-# PDA/Phonecell I-Mode browsers
+'wap','Unknown WAP browser (PDA/Phone browser)', # Generic WAP phone (must be after 'wap*')
+'up\.','UP.Browser (PDA/Phone browser)', # Works for UP.Browser and UP.Link
+# PDA/Phonecell browsers
+'blackberry','BlackBerry (PDA/Phone browser)',
+'cnf2','Supervision I-Mode ByTel (phone)',
'docomo','I-Mode phone (PDA/Phone browser)',
'portalmmm','I-Mode phone (PDA/Phone browser)',
'ipcheck','Supervision IP Check (phone)',
-'cnf2','Supervision I-Mode ByTel (phone)',
+'iphone','IPhone (PDA/Phone browser)',
# Others (TV)
'webtv','WebTV browser',
# Anonymous Proxy Browsers (can be used as grabbers as well...)
@@ -1163,7 +1172,6 @@
'microsoft\-webdav\-miniredir', 'Microsoft Data Access Component Internet Publishing Provider',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager', 'Microsoft Data Access Component Internet Publishing Provider Cache Manager',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav', 'Microsoft Data Access Component Internet Publishing Provider DAV',
-'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery', 'Microsoft Data Access Component Internet Publishing Provider Protocol Discovery',
'POE\-Component\-Client\-HTTP','HTTP user-agent for POE (portable networking framework for Perl)',
'mozilla','Mozilla',
'libwww','LibWWW',
@@ -1263,16 +1271,10 @@
'xaudio','mediaplayer',
'xine','mediaplayer',
'xmms','mediaplayer',
-# PDA/Phonecell I-Mode browsers
-'docomo','pdaphone',
-'portalmmm','pdaphone',
-# Others (TV)
-'webtv','webtv',
-# Anonymous Proxy Browsers (can be used as grabbers as well...)
-'cjb\.net','cjbnet',
# RSS Readers
'abilon', 'abilon',
'aggrevator', 'rss',
+'aiderss', 'rss',
'akregator', 'rss',
'applesyndication', 'rss',
'betanews_reader','rss',
@@ -1306,13 +1308,40 @@
'syndirella', 'rss',
'vienna', 'rss',
'wizz\srss\snews\sreader','wizz',
+# PDA/Phonecell browsers
+#'alcatel','pdaphone', # Alcatel
+#'lg\-','pdaphone', # LG
+#'ericsson','pdaphone', # Ericsson
+#'mot\-','pdaphone', # Motorola
+#'nokia','pdaphone', # Nokia
+#'panasonic','pdaphone', # Panasonic
+#'philips','pdaphone', # Philips
+#'sagem','pdaphone', # Sagem
+#'samsung','pdaphone', # Samsung
+#'sie\-','pdaphone', # SIE
+#'sec\-','pdaphone', # Sony/Ericsson
+#'sonyericsson','pdaphone', # Sony/Ericsson
+#'mmef','pdaphone',
+#'mspie','pdaphone',
+#'wapalizer','pdaphone',
+#'wapsilon','pdaphone',
+'wap','pdaphone', # Generic WAP phone (must be after 'wap*')
+'up\.','pdaphone',
+# PDA/Phonecell browsers
+'blackberry','pdaphone',
+'docomo','pdaphone',
+'iphone','pdaphone',
+'portalmmm','pdaphone',
+# Others (TV)
+'webtv','webtv',
+# Anonymous Proxy Browsers (can be used as grabbers as well...)
+'cjb\.net','cjbnet',
# Other kind of browsers
'apt','apt',
'analogx_proxy','analogx',
'microsoft\-webdav\-miniredir','frontpage',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\scache\smanager','frontpage',
'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sdav','frontpage',
-'microsoft\sdata\saccess\sinternet\spublishing\sprovider\sprotocol\sdiscovery','frontpage',
'gnome\-vfs', 'gnome',
'neon','neon',
'javaws','java',
diff --git a/wwwroot/cgi-bin/lib/operating_systems.pm b/wwwroot/cgi-bin/lib/operating_systems.pm
index 5eaded27..300b3f03 100644
--- a/wwwroot/cgi-bin/lib/operating_systems.pm
+++ b/wwwroot/cgi-bin/lib/operating_systems.pm
@@ -88,13 +88,14 @@
'palmos',
'syllable',
# Miscellanous OS
+'blackberry',
'cp/m',
'crayos',
'dreamcast',
'risc[_+ ]?os',
'symbian',
'webtv',
-'playstation[_+ ]portable',
+'playstation',
'xbox',
'wii',
'vienna',
@@ -186,13 +187,14 @@
'palmos','palmos',
'syllable','syllable',
# Miscellanous OS
+'blackberry','blackberry',
'cp/m','cp/m',
'crayos','crayos',
'dreamcast','dreamcast',
'risc[_+ ]?os','riscos',
'symbian','symbian',
'webtv','webtv',
-'playstation[_+ ]portable', 'psp',
+'playstation', 'psp',
'xbox', 'winxbox',
'wii', 'wii'
);
@@ -271,13 +273,14 @@
'palmos','Palm OS',
'syllable','Syllable',
# Miscellanous OS
+'blackberry','BlackBerry',
'cp/m','CP/M',
'crayos','CrayOS',
'dreamcast','Dreamcast',
'riscos','RISC OS',
'symbian','Symbian OS',
'webtv','WebTV',
-'psp', 'Sony PlayStation Portable',
+'psp', 'Sony PlayStation',
'wii', 'Nintendo Wii'
);
diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm
index a30db477..e2cb4d5a 100644
--- a/wwwroot/cgi-bin/lib/robots.pm
+++ b/wwwroot/cgi-bin/lib/robots.pm
@@ -317,7 +317,6 @@
# Rem: To avoid bad detection, some robot's ids were removed from this list:
# - Robots with ID of 3 letters only
# - Robots called 'webs' and 'tcl'
-# Rem: Some robots mostly used for downloading have also been removed, i.e. wget
# Rem: directhit changed into direct_hit (its real id)
# Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser
# Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser
@@ -648,6 +647,7 @@
'wombat',
'wordpress',
'worm',
+'woozweb',
'wwwc',
'wz101',
'xget',
@@ -660,6 +660,7 @@
'aipbot',
'aleadsoftbot',
'alpha_search_agent',
+'allrati',
'aport',
'archive\.org_bot',
'argus', # Must be before nutch
@@ -670,6 +671,7 @@
'baiduspider',
'becomebot',
'bender',
+'betabot',
'biglotron',
'bittorrent_bot',
'biz360[_+ ]spider',
@@ -724,11 +726,13 @@
'everest\-vulcan',
'ezresult',
'enteprise',
+'facebook',
'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de',
'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de',
'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler
'fast_enterprise_crawler',
'fast\-search\-engine',
+'favicon',
'favorg',
'favorites_sweeper',
'feedburner',
@@ -882,6 +886,7 @@
'sohu', # "sohu agent"
'snappy',
'sphere_scout',
+'spip',
'sproose_crawler',
'steeler',
'steroid__download',
@@ -959,15 +964,23 @@
'ng\/1\.', # put at end to avoid false positive
'ng\/2\.', # put at end to avoid false positive
'exabot', # put at end to avoid false positive
+# Other IDs that are robots 99% of the time
+'wget',
+'libwww',
'java\/[0-9]' # put at end to avoid false positive
);
@RobotsSearchIDOrder_listgen = (
# Generic robot
'robot',
+'checker',
'crawl',
+'discovery',
+'hunter',
+'scanner',
'spider',
-'bot[+:,\.\;\/\\\-]',
-'[+:,\.\;\/\\\-]bot',
+'sucker',
+'bot[\s_+:,\.\;\/\\\-]',
+'[\s_+:,\.\;\/\\\-]bot',
'no_user_agent'
);
@@ -1292,6 +1305,7 @@
'wombat','The Web Wombat',
'wordpress','WordPress',
'worm','The World Wide Web Worm',
+'woozweb','Woozweb Monitoring',
'wwwc','WWWC Ver 0.2.5',
'wz101','WebZinger',
'xget','XGET',
@@ -1304,6 +1318,7 @@
'aipbot','aipbot',
'aleadsoftbot','ALeadSoftbot',
'alpha_search_agent','Alpha Search Agent',
+'allrati','Allrati',
'aport', 'Aport',
'archive\.org_bot','archive.org bot',
'argus','Argus',
@@ -1314,6 +1329,7 @@
'baiduspider','BaiDuSpider',
'becomebot', 'BecomeBot',
'bender','bender focused_crawler',
+'betabot','BetaBot',
'biglotron','Biglotron',
'bittorrent_bot','BitTorrent Bot',
'biz360[_+ ]spider','Biz360 spider',
@@ -1367,11 +1383,13 @@
'everest\-vulcan','Everest-Vulcan',
'ezresult', 'Ezresult',
'enteprise','Fast Enteprise Crawler',
+'facebook','FaceBook bot',
'fast\-search\-engine','Fast-Search-Engine (not fastsearch.com)',
'fast_enterprise_crawler','FAST Enterprise Crawler',
'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * crawleradmin.t-info@telekom.de',
'matrix_s\.p\.a\._\-_fast_enterprise_crawler','Matrix S.p.A. - FAST Enterprise Crawler',
'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de',
+'favicon','FavIconizer',
'favorg','FavOrg',
'favorites_sweeper','Favorites Sweeper',
'feedburner', 'Feedburner',
@@ -1523,6 +1541,7 @@
'sohu','sohu agent',
'snappy','Snappy',
'sphere_scout','Sphere Scout',
+'spip','SPIP',
'sproose_crawler','sproose crawler',
'steroid__download','STEROID Download',
'steeler','Steeler',
@@ -1600,13 +1619,21 @@
'ng\/1\.','NG 1.x (Exalead)', # put at end to avoid false positive
'ng\/2\.','NG 2.x (Exalead)', # put at end to avoid false positive
'exabot','Exabot', # put at end to avoid false positive
+# Other IDs that are robots 99% of the time
+'wget','WGet tools',
+'libwww','Perl tool',
'java\/[0-9]','Java (Often spam bot)', # put at end to avoid false positive
-# Generic root ID
+# Generic robot
'robot', 'Unknown robot (identified by \'robot\')',
+'checker', 'Unknown robot (identified by \'checker\')',
'crawl', 'Unknown robot (identified by \'crawl\')',
+'discovery', 'Unknown robot (identified by \'discovery\')',
+'hunter', 'Unknown robot (identified by \'hunter\')',
+'scanner', 'Unknown robot (identified by \'scanner\')',
'spider', 'Unknown robot (identified by \'spider\')',
-'bot[+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')',
-'[+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')',
+'sucker', 'Unknown robot (identified by \'sucker\')',
+'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')',
+'[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')',
'no_user_agent','Unknown robot (identified by empty user agent string)',
# Unknown robots identified by hit on robots.txt
'unknown', 'Unknown robot (identified by hit on \'robots.txt\')'