From: eldy <> Date: Sun, 18 Apr 2004 18:15:18 +0000 (+0000) Subject: Updated robot database from http://www.robotstxt.org/wc/active/all.txt X-Git-Tag: AWSTATS_6_1_BETA~22 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=70bb78ab59d13d724cd947ea32d515333098bc34;p=thirdparty%2FAWStats.git Updated robot database from http://www.robotstxt.org/wc/active/all.txt --- diff --git a/wwwroot/cgi-bin/awstats.pl b/wwwroot/cgi-bin/awstats.pl index da01bff6..c3173ce3 100644 --- a/wwwroot/cgi-bin/awstats.pl +++ b/wwwroot/cgi-bin/awstats.pl @@ -269,7 +269,7 @@ use vars qw/ %DomainsHashIDLib %MimeHashLib %MimeHashIcon %MimeHashFamily %OSHashID %OSHashLib -%RobotsHashIDLib +%RobotsHashIDLib %RobotsAffiliateLib %SearchEnginesHashID %SearchEnginesHashLib %SearchEnginesKnownUrl %NotSearchEnginesKeys %WormsHashID %WormsHashLib %WormsHashTarget /; diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index a3bae643..c0220fa5 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -12,14 +12,15 @@ # Robots list was found at http://www.robotstxt.org/wc/active/all.txt # Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html # Rem: To avoid bad detection, some robots id were removed from this list: -# - Robots with ID of 2 letters only +# - Robots with ID of 3 letters only # - Robot called 'webs' and 'tcl' +# Rem: Some robots mostly used for download were also removed: wget # Rem: directhit changed into direct_hit (its real id) # Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser # Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser # Rem: roadrunner changed into road_runner # Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser - +# Rem: voyager changed into ^voyager\/ to avoid excluding the voyager and amigavoyager browsers # RobotsSearchIDOrder # It contains all matching criteria to search for in log fields. 
This list is @@ -29,44 +30,52 @@ # Note: Robots IDs are in lower case, ' ' and '+' are changed into '_' and are quoted. #------------------------------------------------------- @RobotsSearchIDOrder_list1 = ( -'antibot', +# Common robots (In robot file) 'appie', 'architext', +'askjeeves', 'bjaaland', -'digout4u', -'echo', -'fast\-webcrawler', 'ferret', 'googlebot', 'gulliver', 'harvest', 'htdig', -'ia_archiver', -'jeeves', -'jennybot', 'linkwalker', 'lycos_', -'mercator', 'moget', 'muscatferret', 'myweb', -'netcraft', 'nomad', -'petersnews', 'scooter', 'slurp', +'^voyager\/', +'weblayers', +# Common robots (Not in robot file) +'antibot', +'digout4u', +'echo', +'fast\-webcrawler', +'ia_archiver', +'jennybot', +'mercator', +'netcraft', +'petersnews', 'unlost_web_crawler', 'voila', -'^voyager\/', # Add ^ and \/ to avoid to exclude voyager and amigavoyager browser 'webbase', -'weblayers', 'wisenutbot' ); @RobotsSearchIDOrder_list2 = ( +# Less common robots (In robot file) +'[^a]fish', +'abcdatos', 'acme\.spider', 'ahoythehomepagefinder', 'alkaline', +'anthill', 'arachnophilia', +'arale', +'araneo', 'aretha', 'ariadne', 'arks', @@ -75,10 +84,12 @@ 'atomz', 'auresys', 'backrub', +'bbot', 'bigbrother', 'blackwidow', 'blindekuh', 'bloodhound', +'borg\-bot', 'brightnet', 'bspider', 'cactvschemistryspider', @@ -86,8 +97,9 @@ 'cassandra', 'cgireader', 'checkbot', +'christcrawler', 'churl', -'cmc', +'cienciaficcion', 'collective', 'combine', 'conceptbot', @@ -97,31 +109,30 @@ 'cruiser', 'cusco', 'cyberspyder', +'desertrealm', 'deweb', 'dienstspider', 'digger', 'diibot', -'directhit', +'direct_hit', 'dnabot', 'download_express', 'dragonbot', 'dwcp', 'e\-collector', 'ebiness', -'eit', 'elfinbot', 'emacs', 'emcspider', 'esther', 'evliyacelebi', -'nzexplorer', +'fastcrawler', 'fdse', 'felix', 'fetchrover', 'fido', 'finnish', 'fireball', -'[^a]fish', 'fouineur', 'francoroute', 'freecrawl', @@ -135,13 +146,13 @@ 'grapnel', 'griffon', 'gromit', +'gulperbot', 'hambot', 'havindex', 
'hometown', 'htmlgobble', 'hyperdecontextualizer', 'iajabot', -'ibm', 'iconoclast', 'ilse', 'imagelock', @@ -163,8 +174,8 @@ 'joebot', 'jubii', 'jumpstation', +'kapsi', 'katipo', -'kdd', 'kilroy', 'ko_yappo_robot', 'labelgrabber\.txt', @@ -182,16 +193,23 @@ 'merzscope', 'meshexplorer', 'mindcrawler', +'mnogosearch', 'momspider', 'monster', 'motor', +'msnbot', +'muncher', 'mwdsearch', +'ndspider', +'nederland\.zoek', 'netcarta', 'netmechanic', 'netscoop', 'newscan\-online', 'nhse', 'northstar', +'nzexplorer', +'objectssearch', 'occam', 'octopus', 'openfind', @@ -204,16 +222,16 @@ 'perignator', 'perlcrawler', 'phantom', +'phpdig', 'piltdownman', 'pimptrain', 'pioneer', 'pitkow', 'pjspider', -'pka', 'plumtreewebaccessor', 'poppi', 'portalb', -'puu', +'psbot', 'python', 'raven', 'rbse', @@ -222,11 +240,13 @@ 'road_runner', 'robbie', 'robi', +'robocrawl', 'robofox', 'robozilla', 'roverbot', 'rules', 'safetynetrobot', +'search\-info', 'search_au', 'searchprocess', 'senrigan', @@ -236,13 +256,12 @@ 'sift', 'simbot', 'site\-valet', -'sitegrabber', 'sitetech', +'skymob', 'slcrawler', 'smartspider', 'snooper', 'solbot', -'spanner', 'speedy', 'spider_monkey', 'spiderbot', @@ -259,9 +278,8 @@ 'tarspider', 'techbot', 'templeton', -'teoma_agent1', -'titin', 'titan', +'titin', 'tkwww', 'tlspider', 'ucsd', @@ -271,6 +289,7 @@ 'verticrawl', 'victoria', 'visionsearch', +'voidbot', 'vwbot', 'w3index', 'w3m2', @@ -282,6 +301,7 @@ 'webcopy', 'webfetcher', 'webfoot', +'webinator', 'weblinker', 'webmirror', 'webmoose', @@ -327,12 +347,9 @@ 'linkchecker', 'microsoft_url_control', 'msiecrawler', -'msnbot', 'nagios', -'nederland\.zoek', 'perman', 'pompos', -'psbot', 'rambler', 'redalert', 'shoutcast', @@ -344,6 +361,7 @@ 'ultraseek', 'webclipping\.com', 'webcompass', +'wonderer', 'yahoo\-verticalcrawler', 'yandex', 'zealbot', @@ -362,12 +380,50 @@ # List of robots names ('robot id','robot clear text') #------------------------------------------------------- %RobotsHashIDLib = ( +# Common 
robots (In robot file) +'appie','Walhello appie', +'architext','ArchitextSpider', +'askjeeves','AskJeeves', +'bjaaland','Bjaaland', +'ferret','Wild Ferret Web Hopper #1, #2, #3', +'googlebot','Googlebot', +'gulliver','Northern Light Gulliver', +'harvest','Harvest', +'htdig','ht://Dig', +'linkwalker','LinkWalker', +'lycos_','Lycos', +'moget','moget', +'muscatferret','Muscat Ferret', +'myweb','Internet Shinchakubin', +'nomad','Nomad', +'scooter','Scooter', +'slurp','Inktomi Slurp', +'^voyager\/','Voyager', +'weblayers','weblayers', +# Common robots (Not in robot file) +'antibot','Antibot', +'digout4u','Digout4u', +'echo','EchO!', +'fast\-webcrawler','Fast-Webcrawler', +'ia_archiver','Alexa (IA Archiver)', +'jennybot','JennyBot', +'mercator','Mercator', +'netcraft','Netcraft', +'petersnews','Petersnews', +'unlost_web_crawler','Unlost Web Crawler', +'voila','Voila', +'webbase', 'WebBase', +'wisenutbot','WISENutbot', +# Less common robots (In robot file) +'[^a]fish','Fish search', +'abcdatos','ABCdatos BotLink', 'acme\.spider','Acme.Spider', 'ahoythehomepagefinder','Ahoy! 
The Homepage Finder', 'alkaline','Alkaline', -'appie','Walhello appie', +'anthill','Anthill', 'arachnophilia','Arachnophilia', -'architext','ArchitextSpider', +'arale','Arale', +'araneo','Araneo', 'aretha','Aretha', 'ariadne','ARIADNE', 'arks','arks', @@ -376,11 +432,12 @@ 'atomz','Atomz.com Search Robot', 'auresys','AURESYS', 'backrub','BackRub', +'bbot','BBot', 'bigbrother','Big Brother', -'bjaaland','Bjaaland', 'blackwidow','BlackWidow', 'blindekuh','Die Blinde Kuh', 'bloodhound','Bloodhound', +'borg\-bot','Borg-Bot', 'brightnet','bright.net caching robot', 'bspider','BSpider', 'cactvschemistryspider','CACTVS Chemistry Spider', @@ -388,8 +445,9 @@ 'cassandra','Cassandra', 'cgireader','Digimarc Marcspider/CGI', 'checkbot','Checkbot', +'christcrawler','ChristCrawler.com', 'churl','churl', -'cmc','CMC/0.01', +'cienciaficcion','cIeNcIaFiCcIoN.nEt', 'collective','Collective', 'combine','Combine System', 'conceptbot','Conceptbot', @@ -399,32 +457,30 @@ 'cruiser','Internet Cruiser Robot', 'cusco','Cusco', 'cyberspyder','CyberSpyder Link Test', +'desertrealm','Desert Realm Spider', 'deweb','DeWeb(c) Katalog/Index', 'dienstspider','DienstSpider', 'digger','Digger', 'diibot','Digital Integrity Robot', -'directhit','Direct Hit Grabber', +'direct_hit','Direct Hit Grabber', 'dnabot','DNAbot', 'download_express','DownLoad Express', 'dragonbot','DragonBot', 'dwcp','DWCP (Dridus\' Web Cataloging Project)', 'e\-collector','e-collector', 'ebiness','EbiNess', -'eit','EIT Link Verifier Robot', 'elfinbot','ELFINBOT', 'emacs','Emacs-w3 Search Engine', 'emcspider','ananzi', 'esther','Esther', 'evliyacelebi','Evliya Celebi', -'nzexplorer','nzexplorer', +'fastcrawler','FastCrawler', 'fdse','Fluid Dynamics Search Engine robot', 'felix','Felix IDE', -'ferret','Wild Ferret Web Hopper #1, #2, #3', 'fetchrover','FetchRover', 'fido','fido', 'finnish','Hämähäkki', 'fireball','KIT-Fireball', -'[^a]fish','Fish search', 'fouineur','Fouineur', 'francoroute','Robot Francoroute', 
'freecrawl','Freecrawl', @@ -435,20 +491,16 @@ 'getbot','GetBot', 'geturl','GetURL', 'golem','Golem', -'googlebot','Googlebot (Google)', 'grapnel','Grapnel/0.01 Experiment', 'griffon','Griffon', 'gromit','Gromit', -'gulliver','Northern Light Gulliver', +'gulperbot','Gulper Bot', 'hambot','HamBot', -'harvest','Harvest', 'havindex','havIndex', 'hometown','Hometown Spider Pro', -'htdig','ht://Dig', 'htmlgobble','HTMLgobble', 'hyperdecontextualizer','Hyper-Decontextualizer', 'iajabot','iajaBot', -'ibm','IBM_Planetwide', 'iconoclast','Popular Iconoclast', 'ilse','Ingrid', 'imagelock','Imagelock', @@ -465,14 +517,13 @@ 'javabee','JavaBee', 'jbot','JBot Java Web Robot', 'jcrawler','JCrawler', -'jeeves','Jeeves', 'jobo','JoBo Java Web Robot', 'jobot','Jobot', 'joebot','JoeBot', 'jubii','The Jubii Indexing Robot', 'jumpstation','JumpStation', +'kapsi','image.kapsi.net', 'katipo','Katipo', -'kdd','KDD-Explorer', 'kilroy','Kilroy', 'ko_yappo_robot','KO_Yappo_Robot', 'labelgrabber\.txt','LabelGrabber', @@ -480,10 +531,8 @@ 'legs','legs', 'linkidator','Link Validator', 'linkscan','LinkScan', -'linkwalker','LinkWalker', 'lockon','Lockon', 'logo_gif','logo.gif Crawler', -'lycos_','Lycos', 'macworm','Mac WWWWorm', 'magpie','Magpie', 'marvin','marvin/infoseek', @@ -492,22 +541,23 @@ 'merzscope','MerzScope', 'meshexplorer','NEC-MeshExplorer', 'mindcrawler','MindCrawler', -'moget','moget', +'mnogosearch','mnoGoSearch search engine software', 'momspider','MOMspider', 'monster','Monster', 'motor','Motor', -'muscatferret','Muscat Ferret', +'msnbot','MSNBot', +'muncher','Muncher', 'mwdsearch','Mwd.Search', -'myweb','Internet Shinchakubin', -'nagios','Nagios monitoring checker', +'ndspider','NDSpider', +'nederland\.zoek','Nederland.zoek', 'netcarta','NetCarta WebMap Engine', -'netcraft','Netcraft Web Server Survey', 'netmechanic','NetMechanic', 'netscoop','NetScoop', 'newscan\-online','newscan-online', 'nhse','NHSE Web Forager', -'nomad','Nomad', 'northstar','The NorthStar Robot', 
+'nzexplorer','nzexplorer', +'objectssearch','ObjectsSearch', 'occam','Occam', 'octopus','HKU WWW Octopus', 'openfind','Openfind data gatherer', @@ -520,16 +570,16 @@ 'perignator','The Peregrinator', 'perlcrawler','PerlCrawler 1.0', 'phantom','Phantom', +'phpdig','PhpDig', 'piltdownman','PiltdownMan', 'pimptrain','Pimptrain.com\'s robot', 'pioneer','Pioneer', 'pitkow','html_analyzer', 'pjspider','Portal Juice Spider', -'pka','PGP Key Agent', 'plumtreewebaccessor','PlumtreeWebAccessor', 'poppi','Poppi', 'portalb','PortalB Spider', -'puu','GetterroboPlus Puu', +'psbot','psbot', 'python','The Python Robot', 'raven','Raven Search', 'rbse','RBSE Spider', @@ -538,12 +588,13 @@ 'road_runner','Road Runner: The ImageScape Robot', 'robbie','Robbie the Robot', 'robi','ComputingSite Robi/1.0', +'robocrawl','RoboCrawl Spider', 'robofox','RoboFox', 'robozilla','Robozilla', 'roverbot','Roverbot', 'rules','RuLeS', 'safetynetrobot','SafetyNet Robot', -'scooter','Scooter (AltaVista)', +'search\-info','Sleek', 'search_au','Search.Aus-AU.COM', 'searchprocess','SearchProcess', 'senrigan','Senrigan', @@ -553,14 +604,12 @@ 'sift','Sift', 'simbot','Simmany Robot Ver1.0', 'site\-valet','Site Valet', -'sitegrabber','Open Text Index Robot', 'sitetech','SiteTech-Rover', +'skymob','Skymob.com', 'slcrawler','SLCrawler', -'slurp','Inktomi Slurp', 'smartspider','Smart Spider', 'snooper','Snooper', 'solbot','Solbot', -'spanner','Spanner', 'speedy','Speedy Spider', 'spider_monkey','spider_monkey', 'spiderbot','SpiderBot', @@ -577,9 +626,8 @@ 'tarspider','tarspider', 'techbot','TechBOT', 'templeton','Templeton', -'teoma_agent1','TeomaTechnologies', -'titin','TitIn', 'titan','TITAN', +'titin','TitIn', 'tkwww','The TkWWW Robot', 'tlspider','TLSpider', 'ucsd','UCSD Crawl', @@ -589,19 +637,19 @@ 'verticrawl','Verticrawl', 'victoria','Victoria', 'visionsearch','vision-search', -'^voyager\/','Voyager', +'voidbot','void-bot', 'vwbot','VWbot', 'w3index','The NWI Robot', 'w3m2','W3M2', 
-'wallpaper','WallPaper', +'wallpaper','WallPaper (alias crawlpaper)', 'wanderer','the World Wide Web Wanderer', 'wapspider','w@pSpider by wap4.com', 'webbandit','WebBandit Web Spider', 'webcatcher','WebCatcher', 'webcopy','WebCopy', -'webfetcher','Webfetcher', +'webfetcher','webfetcher', 'webfoot','The Webfoot Robot', -'weblayers','Weblayers', +'webinator','Webinator', 'weblinker','WebLinker', 'webmirror','WebMirror', 'webmoose','The Web Moose', @@ -624,9 +672,7 @@ 'wwwc','WWWC Ver 0.2.5', 'wz101','WebZinger', 'xget','XGET', -'nederland\.zoek','Nederland.zoek', # Other robots reported by users -'antibot', 'Antibot', 'aport', 'Aport', 'awbot', 'AWBot', 'baiduspider','BaiDuSpider', @@ -635,31 +681,23 @@ 'bumblebee', 'Bumblebee (relevare.com)', 'cscrawler','CsCrawler', 'daviesbot', 'DaviesBot', -'digout4u', 'Digout4u', -'echo', 'EchO!', 'exactseek','ExactSeek Crawler', 'ezresult', 'Ezresult', -'fast\-webcrawler', 'Fast-Webcrawler (AllTheWeb)', 'gigabot','GigaBot', 'gnodspider','GNOD Spider', 'grub','Grub.org', 'henrythemiragorobot', 'Mirago', 'holmes', 'Holmes', -'ia_archiver', 'Alexa (IA Archiver)', 'internetseer', 'InternetSeer', -'jennybot', 'JennyBot', 'justview', 'JustView', 'linkbot','LinkBot', 'linkchecker','LinkChecker', 'metager\-linkchecker','MetaGer LinkChecker', 'microsoft_url_control','Microsoft URL Control', -'mercator', 'Mercator', +'nagios','Nagios', 'msiecrawler','MSIECrawler', -'msnbot','MSNBot', 'perman', 'Perman surfer', -'petersnews', 'Petersnews', 'pompos','Pompos', -'psbot','psBot', 'rambler', 'StackRambler', 'redalert', 'Red Alert', 'shoutcast','Shoutcast Directory Service', @@ -669,16 +707,13 @@ 'turtle', 'Turtle', 'turtlescanner', 'Turtle', 'ultraseek', 'Ultraseek', -'unlost_web_crawler', 'Unlost Web Crawler', -'voila', 'Voila', -'webbase', 'WebBase', -'webcompass', 'webcompass', 'webclipping\.com', 'WebClipping.com', -'wisenutbot','WISENutbot (Looksmart)', +'webcompass', 'webcompass', +'wonderer', 'Web Wombat Redback Spider', 
'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', 'yandex', 'Yandex bot', 'zealbot','ZealBot', -'zyborg','Zyborg (Looksmart)', +'zyborg','Zyborg', # Generic root ID 'robot', 'Unknown robot (identified by \'robot\')', @@ -690,4 +725,19 @@ ); +# RobotsAffiliateLib +# This list tries to tell which search engine each robot is used by +#------------------------------------------------------------- +%RobotsAffiliateLib = ( +'fast\-webcrawler'=>'AllTheWeb', +'googlebot'=>'Google', +'msnbot'=>'MSN', +'scooter'=>'AltaVista', +'wisenutbot'=>'Looksmart', +'yahoo\-verticalcrawler'=>'Yahoo', +'zyborg'=>'Looksmart' +); + + + 1;