# Robots list was found at http://www.robotstxt.org/wc/active/all.txt
# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html
# Rem: To avoid bad detection, some robot IDs were removed from this list:
-# - Robots with ID of 2 letters only
+# - Robots with ID of 3 letters only
# - Robot called 'webs' and 'tcl'
+# Rem: Some robots mostly used for downloading were also removed: wget
# Rem: directhit changed into direct_hit (its real id)
# Rem: calif changed into calif[^r] to avoid confusion with the Tiscalifreenet browser
# Rem: fish changed into [^a]fish to avoid confusion with the Madsafish browser
# Rem: roadrunner changed into road_runner
# Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser
-
+# Rem: voyager changed into ^voyager\/ to avoid excluding the voyager and amigavoyager browsers
# RobotsSearchIDOrder
# It contains all matching criteria to search for in log fields.
# NOTE(review): the original sentence continued ("This list is ...") but is cut
# off here; presumably the order of the entries is the search order - confirm.
# Note: Robot IDs are in lower case, ' ' and '+' are changed into '_' and are quoted.
# Entries use regex syntax: e.g. 'fast\-webcrawler' escapes the '-' and
# '^voyager\/' is anchored at the start of the field.
#-------------------------------------------------------
@RobotsSearchIDOrder_list1 = (
-'antibot',
+# Common robots (In robot file)
'appie',
'architext',
+'askjeeves',
'bjaaland',
-'digout4u',
-'echo',
-'fast\-webcrawler',
'ferret',
'googlebot',
'gulliver',
'harvest',
'htdig',
-'ia_archiver',
-'jeeves',
-'jennybot',
'linkwalker',
'lycos_',
-'mercator',
'moget',
'muscatferret',
'myweb',
-'netcraft',
'nomad',
-'petersnews',
'scooter',
'slurp',
+'^voyager\/',
+'weblayers',
+# Common robots (Not in robot file)
+'antibot',
+'digout4u',
+'echo',
+'fast\-webcrawler',
+'ia_archiver',
+'jennybot',
+'mercator',
+'netcraft',
+'petersnews',
'unlost_web_crawler',
'voila',
-'^voyager\/', # Add ^ and \/ to avoid to exclude voyager and amigavoyager browser
'webbase',
-'weblayers',
'wisenutbot'
);
@RobotsSearchIDOrder_list2 = (
+# Less common robots (In robot file)
+'[^a]fish',
+'abcdatos',
'acme\.spider',
'ahoythehomepagefinder',
'alkaline',
+'anthill',
'arachnophilia',
+'arale',
+'araneo',
'aretha',
'ariadne',
'arks',
'atomz',
'auresys',
'backrub',
+'bbot',
'bigbrother',
'blackwidow',
'blindekuh',
'bloodhound',
+'borg\-bot',
'brightnet',
'bspider',
'cactvschemistryspider',
'cassandra',
'cgireader',
'checkbot',
+'christcrawler',
'churl',
-'cmc',
+'cienciaficcion',
'collective',
'combine',
'conceptbot',
'cruiser',
'cusco',
'cyberspyder',
+'desertrealm',
'deweb',
'dienstspider',
'digger',
'diibot',
-'directhit',
+'direct_hit',
'dnabot',
'download_express',
'dragonbot',
'dwcp',
'e\-collector',
'ebiness',
-'eit',
'elfinbot',
'emacs',
'emcspider',
'esther',
'evliyacelebi',
-'nzexplorer',
+'fastcrawler',
'fdse',
'felix',
'fetchrover',
'fido',
'finnish',
'fireball',
-'[^a]fish',
'fouineur',
'francoroute',
'freecrawl',
'grapnel',
'griffon',
'gromit',
+'gulperbot',
'hambot',
'havindex',
'hometown',
'htmlgobble',
'hyperdecontextualizer',
'iajabot',
-'ibm',
'iconoclast',
'ilse',
'imagelock',
'joebot',
'jubii',
'jumpstation',
+'kapsi',
'katipo',
-'kdd',
'kilroy',
'ko_yappo_robot',
'labelgrabber\.txt',
'merzscope',
'meshexplorer',
'mindcrawler',
+'mnogosearch',
'momspider',
'monster',
'motor',
+'msnbot',
+'muncher',
'mwdsearch',
+'ndspider',
+'nederland\.zoek',
'netcarta',
'netmechanic',
'netscoop',
'newscan\-online',
'nhse',
'northstar',
+'nzexplorer',
+'objectssearch',
'occam',
'octopus',
'openfind',
'perignator',
'perlcrawler',
'phantom',
+'phpdig',
'piltdownman',
'pimptrain',
'pioneer',
'pitkow',
'pjspider',
-'pka',
'plumtreewebaccessor',
'poppi',
'portalb',
-'puu',
+'psbot',
'python',
'raven',
'rbse',
'road_runner',
'robbie',
'robi',
+'robocrawl',
'robofox',
'robozilla',
'roverbot',
'rules',
'safetynetrobot',
+'search\-info',
'search_au',
'searchprocess',
'senrigan',
'sift',
'simbot',
'site\-valet',
-'sitegrabber',
'sitetech',
+'skymob',
'slcrawler',
'smartspider',
'snooper',
'solbot',
-'spanner',
'speedy',
'spider_monkey',
'spiderbot',
'tarspider',
'techbot',
'templeton',
-'teoma_agent1',
-'titin',
'titan',
+'titin',
'tkwww',
'tlspider',
'ucsd',
'verticrawl',
'victoria',
'visionsearch',
+'voidbot',
'vwbot',
'w3index',
'w3m2',
'webcopy',
'webfetcher',
'webfoot',
+'webinator',
'weblinker',
'webmirror',
'webmoose',
'linkchecker',
'microsoft_url_control',
'msiecrawler',
-'msnbot',
'nagios',
-'nederland\.zoek',
'perman',
'pompos',
-'psbot',
'rambler',
'redalert',
'shoutcast',
'ultraseek',
'webclipping\.com',
'webcompass',
+'wonderer',
'yahoo\-verticalcrawler',
'yandex',
'zealbot',
# List of robots names ('robot id','robot clear text')
# Each pair maps a robot ID to the human-readable name of that robot.
# Keys are written exactly like the entries of the @RobotsSearchIDOrder_*
# lists, including any regex syntax (e.g. '^voyager\/', '[^a]fish').
#-------------------------------------------------------
%RobotsHashIDLib = (
+# Common robots (In robot file)
+'appie','Walhello appie',
+'architext','ArchitextSpider',
+'askjeeves','AskJeeves',
+'bjaaland','Bjaaland',
+'ferret','Wild Ferret Web Hopper #1, #2, #3',
+'googlebot','Googlebot',
+'gulliver','Northern Light Gulliver',
+'harvest','Harvest',
+'htdig','ht://Dig',
+'linkwalker','LinkWalker',
+'lycos_','Lycos',
+'moget','moget',
+'muscatferret','Muscat Ferret',
+'myweb','Internet Shinchakubin',
+'nomad','Nomad',
+'scooter','Scooter',
+'slurp','Inktomi Slurp',
+'^voyager\/','Voyager',
+'weblayers','weblayers',
+# Common robots (Not in robot file)
+'antibot','Antibot',
+'digout4u','Digout4u',
+'echo','EchO!',
+'fast\-webcrawler','Fast-Webcrawler',
+'ia_archiver','Alexa (IA Archiver)',
+'jennybot','JennyBot',
+'mercator','Mercator',
+'netcraft','Netcraft',
+'petersnews','Petersnews',
+'unlost_web_crawler','Unlost Web Crawler',
+'voila','Voila',
+'webbase', 'WebBase',
+'wisenutbot','WISENutbot',
+# Less common robots (In robot file)
+'[^a]fish','Fish search',
+'abcdatos','ABCdatos BotLink',
'acme\.spider','Acme.Spider',
'ahoythehomepagefinder','Ahoy! The Homepage Finder',
'alkaline','Alkaline',
-'appie','Walhello appie',
+'anthill','Anthill',
'arachnophilia','Arachnophilia',
-'architext','ArchitextSpider',
+'arale','Arale',
+'araneo','Araneo',
'aretha','Aretha',
'ariadne','ARIADNE',
'arks','arks',
'atomz','Atomz.com Search Robot',
'auresys','AURESYS',
'backrub','BackRub',
+'bbot','BBot',
'bigbrother','Big Brother',
-'bjaaland','Bjaaland',
'blackwidow','BlackWidow',
'blindekuh','Die Blinde Kuh',
'bloodhound','Bloodhound',
+'borg\-bot','Borg-Bot',
'brightnet','bright.net caching robot',
'bspider','BSpider',
'cactvschemistryspider','CACTVS Chemistry Spider',
'cassandra','Cassandra',
'cgireader','Digimarc Marcspider/CGI',
'checkbot','Checkbot',
+'christcrawler','ChristCrawler.com',
'churl','churl',
-'cmc','CMC/0.01',
+'cienciaficcion','cIeNcIaFiCcIoN.nEt',
'collective','Collective',
'combine','Combine System',
'conceptbot','Conceptbot',
'cruiser','Internet Cruiser Robot',
'cusco','Cusco',
'cyberspyder','CyberSpyder Link Test',
+'desertrealm','Desert Realm Spider',
'deweb','DeWeb(c) Katalog/Index',
'dienstspider','DienstSpider',
'digger','Digger',
'diibot','Digital Integrity Robot',
-'directhit','Direct Hit Grabber',
+'direct_hit','Direct Hit Grabber',
'dnabot','DNAbot',
'download_express','DownLoad Express',
'dragonbot','DragonBot',
'dwcp','DWCP (Dridus\' Web Cataloging Project)',
'e\-collector','e-collector',
'ebiness','EbiNess',
-'eit','EIT Link Verifier Robot',
'elfinbot','ELFINBOT',
'emacs','Emacs-w3 Search Engine',
'emcspider','ananzi',
'esther','Esther',
'evliyacelebi','Evliya Celebi',
-'nzexplorer','nzexplorer',
+'fastcrawler','FastCrawler',
'fdse','Fluid Dynamics Search Engine robot',
'felix','Felix IDE',
-'ferret','Wild Ferret Web Hopper #1, #2, #3',
'fetchrover','FetchRover',
'fido','fido',
'finnish','Hämähäkki',
'fireball','KIT-Fireball',
-'[^a]fish','Fish search',
'fouineur','Fouineur',
'francoroute','Robot Francoroute',
'freecrawl','Freecrawl',
'getbot','GetBot',
'geturl','GetURL',
'golem','Golem',
-'googlebot','Googlebot (Google)',
'grapnel','Grapnel/0.01 Experiment',
'griffon','Griffon',
'gromit','Gromit',
-'gulliver','Northern Light Gulliver',
+'gulperbot','Gulper Bot',
'hambot','HamBot',
-'harvest','Harvest',
'havindex','havIndex',
'hometown','Hometown Spider Pro',
-'htdig','ht://Dig',
'htmlgobble','HTMLgobble',
'hyperdecontextualizer','Hyper-Decontextualizer',
'iajabot','iajaBot',
-'ibm','IBM_Planetwide',
'iconoclast','Popular Iconoclast',
'ilse','Ingrid',
'imagelock','Imagelock',
'javabee','JavaBee',
'jbot','JBot Java Web Robot',
'jcrawler','JCrawler',
-'jeeves','Jeeves',
'jobo','JoBo Java Web Robot',
'jobot','Jobot',
'joebot','JoeBot',
'jubii','The Jubii Indexing Robot',
'jumpstation','JumpStation',
+'kapsi','image.kapsi.net',
'katipo','Katipo',
-'kdd','KDD-Explorer',
'kilroy','Kilroy',
'ko_yappo_robot','KO_Yappo_Robot',
'labelgrabber\.txt','LabelGrabber',
'legs','legs',
'linkidator','Link Validator',
'linkscan','LinkScan',
-'linkwalker','LinkWalker',
'lockon','Lockon',
'logo_gif','logo.gif Crawler',
-'lycos_','Lycos',
'macworm','Mac WWWWorm',
'magpie','Magpie',
'marvin','marvin/infoseek',
'merzscope','MerzScope',
'meshexplorer','NEC-MeshExplorer',
'mindcrawler','MindCrawler',
-'moget','moget',
+'mnogosearch','mnoGoSearch search engine software',
'momspider','MOMspider',
'monster','Monster',
'motor','Motor',
-'muscatferret','Muscat Ferret',
+'msnbot','MSNBot',
+'muncher','Muncher',
'mwdsearch','Mwd.Search',
-'myweb','Internet Shinchakubin',
-'nagios','Nagios monitoring checker',
+'ndspider','NDSpider',
+'nederland\.zoek','Nederland.zoek',
'netcarta','NetCarta WebMap Engine',
-'netcraft','Netcraft Web Server Survey',
'netmechanic','NetMechanic',
'netscoop','NetScoop',
'newscan\-online','newscan-online',
'nhse','NHSE Web Forager',
-'nomad','Nomad',
'northstar','The NorthStar Robot',
+'nzexplorer','nzexplorer',
+'objectssearch','ObjectsSearch',
'occam','Occam',
'octopus','HKU WWW Octopus',
'openfind','Openfind data gatherer',
'perignator','The Peregrinator',
'perlcrawler','PerlCrawler 1.0',
'phantom','Phantom',
+'phpdig','PhpDig',
'piltdownman','PiltdownMan',
'pimptrain','Pimptrain.com\'s robot',
'pioneer','Pioneer',
'pitkow','html_analyzer',
'pjspider','Portal Juice Spider',
-'pka','PGP Key Agent',
'plumtreewebaccessor','PlumtreeWebAccessor',
'poppi','Poppi',
'portalb','PortalB Spider',
-'puu','GetterroboPlus Puu',
+'psbot','psbot',
'python','The Python Robot',
'raven','Raven Search',
'rbse','RBSE Spider',
'road_runner','Road Runner: The ImageScape Robot',
'robbie','Robbie the Robot',
'robi','ComputingSite Robi/1.0',
+'robocrawl','RoboCrawl Spider',
'robofox','RoboFox',
'robozilla','Robozilla',
'roverbot','Roverbot',
'rules','RuLeS',
'safetynetrobot','SafetyNet Robot',
-'scooter','Scooter (AltaVista)',
+'search\-info','Sleek',
'search_au','Search.Aus-AU.COM',
'searchprocess','SearchProcess',
'senrigan','Senrigan',
'sift','Sift',
'simbot','Simmany Robot Ver1.0',
'site\-valet','Site Valet',
-'sitegrabber','Open Text Index Robot',
'sitetech','SiteTech-Rover',
+'skymob','Skymob.com',
'slcrawler','SLCrawler',
-'slurp','Inktomi Slurp',
'smartspider','Smart Spider',
'snooper','Snooper',
'solbot','Solbot',
-'spanner','Spanner',
'speedy','Speedy Spider',
'spider_monkey','spider_monkey',
'spiderbot','SpiderBot',
'tarspider','tarspider',
'techbot','TechBOT',
'templeton','Templeton',
-'teoma_agent1','TeomaTechnologies',
-'titin','TitIn',
'titan','TITAN',
+'titin','TitIn',
'tkwww','The TkWWW Robot',
'tlspider','TLSpider',
'ucsd','UCSD Crawl',
'verticrawl','Verticrawl',
'victoria','Victoria',
'visionsearch','vision-search',
-'^voyager\/','Voyager',
+'voidbot','void-bot',
'vwbot','VWbot',
'w3index','The NWI Robot',
'w3m2','W3M2',
-'wallpaper','WallPaper',
+'wallpaper','WallPaper (alias crawlpaper)',
'wanderer','the World Wide Web Wanderer',
'wapspider','w@pSpider by wap4.com',
'webbandit','WebBandit Web Spider',
'webcatcher','WebCatcher',
'webcopy','WebCopy',
-'webfetcher','Webfetcher',
+'webfetcher','webfetcher',
'webfoot','The Webfoot Robot',
-'weblayers','Weblayers',
+'webinator','Webinator',
'weblinker','WebLinker',
'webmirror','WebMirror',
'webmoose','The Web Moose',
'wwwc','WWWC Ver 0.2.5',
'wz101','WebZinger',
'xget','XGET',
-'nederland\.zoek','Nederland.zoek',
# Other robots reported by users
-'antibot', 'Antibot',
'aport', 'Aport',
'awbot', 'AWBot',
'baiduspider','BaiDuSpider',
'bumblebee', 'Bumblebee (relevare.com)',
'cscrawler','CsCrawler',
'daviesbot', 'DaviesBot',
-'digout4u', 'Digout4u',
-'echo', 'EchO!',
'exactseek','ExactSeek Crawler',
'ezresult', 'Ezresult',
-'fast\-webcrawler', 'Fast-Webcrawler (AllTheWeb)',
'gigabot','GigaBot',
'gnodspider','GNOD Spider',
'grub','Grub.org',
'henrythemiragorobot', 'Mirago',
'holmes', 'Holmes',
-'ia_archiver', 'Alexa (IA Archiver)',
'internetseer', 'InternetSeer',
-'jennybot', 'JennyBot',
'justview', 'JustView',
'linkbot','LinkBot',
'linkchecker','LinkChecker',
'metager\-linkchecker','MetaGer LinkChecker',
'microsoft_url_control','Microsoft URL Control',
-'mercator', 'Mercator',
+'nagios','Nagios',
'msiecrawler','MSIECrawler',
-'msnbot','MSNBot',
'perman', 'Perman surfer',
-'petersnews', 'Petersnews',
'pompos','Pompos',
-'psbot','psBot',
'rambler', 'StackRambler',
'redalert', 'Red Alert',
'shoutcast','Shoutcast Directory Service',
'turtle', 'Turtle',
'turtlescanner', 'Turtle',
'ultraseek', 'Ultraseek',
-'unlost_web_crawler', 'Unlost Web Crawler',
-'voila', 'Voila',
-'webbase', 'WebBase',
-'webcompass', 'webcompass',
'webclipping\.com', 'WebClipping.com',
-'wisenutbot','WISENutbot (Looksmart)',
+'webcompass', 'webcompass',
+'wonderer', 'Web Wombat Redback Spider',
'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler',
'yandex', 'Yandex bot',
'zealbot','ZealBot',
-'zyborg','Zyborg (Looksmart)',
+'zyborg','Zyborg',
# Generic root ID
'robot', 'Unknown robot (identified by \'robot\')',
);
+# RobotsAffiliateLib
+# This list tries to tell which search engine uses a given robot. Keys are the same robot IDs as in %RobotsHashIDLib; values are the search engine names.
+#-------------------------------------------------------------
+%RobotsAffiliateLib = (
+'fast\-webcrawler'=>'AllTheWeb',
+'googlebot'=>'Google',
+'msnbot'=>'MSN',
+'scooter'=>'AltaVista',
+'wisenutbot'=>'Looksmart',
+'yahoo\-verticalcrawler'=>'Yahoo',
+'zyborg'=>'Looksmart'
+);
+
+
+
1;