'ng\/1\.', # put at end to avoid false positive
'ng\/2\.', # put at end to avoid false positive
'exabot', # put at end to avoid false positive
+# Additional bots found by Sussex.
+'^[1-3]$', # Bots hiding their identity: a bare "1", "2" or "3" is not a valid user agent.
+'alltop',
+'applesyndication',
+'asynchttpclient',
+'bingbot',
+'blogged_crawl',
+'bloglovin',
+'butterfly',
+'buzztracker',
+'carpathia',
+'catbot',
+'chattertrap',
+'check_http', # Nagios check_http, a monitoring tool
+'coldfusion',
+'covario',
+'daylifefeedfetcher',
+'discobot',
+'dlvr\.it',
+'dreamwidth',
+'drupal',
+'ezoom',
+'feedmyinbox',
+'feedroll\.com',
+'feedzira',
+'fever\/',
+'freenews',
+'geohasher',
+'hanrss',
+'inagist',
+'jacobin club',
+'jakarta',
+'js\-kit',
+'largesmall crawler',
+'linkedinbot',
+'longurl',
+'metauri',
+'microsoft\-webdav\-miniredir',
+'^motorola$',
+'movabletype',
+# These appear to be bots trying to hide: the usual architecture data is missing from the user agent.
+'^mozilla\/3\.0 \(compatible$',
+'^mozilla\/4\.0$',
+'^mozilla\/4\.0 \(compatible;\)$',
+'^mozilla\/5\.0$',
+'^mozilla\/5\.0 \(compatible;$',
+'^mozilla\/5\.0 \(en\-us\)$',
+'^mozilla\/5\.0 firefox\/3\.0\.5$',
+'^msie',
+# End of hiding bots.
+'netnewswire',
+' netseer ',
+'netvibes',
+'newrelicpinger',
+'newsfox',
+'nextgensearchbot',
+'ning',
+'pingdom',
+'pita',
+'postpost',
+'postrank',
+'printfulbot',
+'protopage',
+'proximic',
+'quipply',
+'r6\_',
+'ratingburner',
+'regator',
+'rome client',
+'rpt\-httpclient',
+'rssgraffiti',
+'sage\+\+',
+'scoutjet',
+'simplepie',
+'sitebot',
+'summify\.com',
+'superfeedr',
+'synthesio',
+'teoma',
+'topblogsinfo',
+'topix\.net',
+'trapit',
+'trileet',
+'tweetedtimes',
+'twisted pagegetter',
+'twitterbot',
+'twitterfeed',
+'unwindfetchor',
+'wazzup',
+'windows\-rss\-platform',
+'wiumi',
+'xydo',
+'yahoo! slurp',
+'yahoo pipes',
+'yahoo\-newscrawler',
+'yahoocachesystem',
+'yahooexternalcache',
+'yahoo! searchmonkey',
+'yahooysmcm',
+'yammer',
+'yandexbot',
+'yeti',
+'yie8',
+'youdao',
+'yourls',
+'zemanta',
+'zend_http_client',
# Other id that are 99% of robots
'wget',
'libwww',
-'java\/[0-9]' # put at end to avoid false positive
+'^java\/[0-9]' # put at end to avoid false positive
);
@RobotsSearchIDOrder_listgen = (
# Generic robot
'sucker',
'bot[\s_+:,\.\;\/\\\-]',
'[\s_+:,\.\;\/\\\-]bot',
+'curl', # labelled in the description table as a *nix document-retrieval tool, "Most likely a bot"
+'php', # labelled "A PHP script"; NOTE(review): unanchored, so presumably matches "php" anywhere in the user agent - confirm
+'ruby\/', # labelled "Ruby script"; the escaped slash requires "ruby/" literally
'no_user_agent'
);
# Other id that are 99% of robots
'wget','WGet tools',
'libwww','Perl tool',
-'java\/[0-9]','<a href="http://www.projecthoneypot.org/harvester_useragents.php" title="Bot home page [new window]" target="_blank">Java (Often spam bot)</a>', # put at end to avoid false positive
+'^java\/[0-9]','<a href="http://www.projecthoneypot.org/harvester_useragents.php" title="Bot home page [new window]" target="_blank">Java (Often spam bot)</a>', # put at end to avoid false positive
# Generic robot
'robot', 'Unknown robot (identified by \'robot\')',
'checker', 'Unknown robot (identified by \'checker\')',
'sucker', 'Unknown robot (identified by \'sucker\')',
'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')',
'[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')',
+'curl', 'Common *nix tool for automating web document retireval. Most likely a bot.',
+'php', 'A PHP script',
+'ruby\/', 'Ruby script',
+# Additional bots found by Sussex.
+'^[1-3]$', 'Generic bot identified as "1", "2" or "3"',
+'alltop', 'alltop',
+'applesyndication', 'applesyndication',
+'asynchttpclient', 'asynchttpclient',
+'bingbot', 'bingbot',
+'blogged_crawl', 'blogged_crawl',
+'bloglovin', 'bloglovin',
+'butterfly', 'butterfly',
+'buzztracker', 'buzztracker',
+'carpathia', 'carpathia',
+'catbot', 'catbot',
+'chattertrap', 'chattertrap',
+'check_http', 'check_http (nagios)',
+'coldfusion', 'coldfusion',
+'covario', 'covario',
+'daylifefeedfetcher', 'daylifefeedfetcher',
+'discobot', 'discobot',
+'dlvr\.it', 'dlvr.it',
+'dreamwidth', 'dreamwidth',
+'drupal', 'Drupal Site',
+'ezoom', 'ezoom',
+'feedmyinbox', 'feedmyinbox',
+'feedroll\.com', 'feedroll.com',
+'feedzira', 'feedzira',
+'fever\/', '<a href="http://feedafever.com">Feed a Fever</a>',
+'freenews', 'freenews',
+'geohasher', 'geohasher',
+'hanrss', 'hanrss',
+'inagist', 'inagist',
+'jacobin club', 'jacobin club',
+'jakarta', 'jakarta',
+'js\-kit', 'js-kit',
+'largesmall crawler', 'largesmall crawler',
+'linkedinbot', 'linkedinbot',
+'longurl', 'longurl',
+'metauri', 'metauri',
+'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir',
+'^motorola$', 'Suspected Bot masquerading as "Motorola"',
+'movabletype', 'movabletype',
+'^mozilla\/3\.0 \(compatible$', 'Suspected bot masqurading as Mozilla',
+'^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla',
+'^mozilla\/4\.0 \(compatible;\)$', 'Suspected bot masqurading as Mozilla',
+'^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla',
+'^mozilla\/5\.0 \(compatible;$', 'Suspected bot masqurading as Mozilla',
+'^mozilla\/5\.0 \(en\-us\)$', 'Suspected bot masqurading as Mozilla',
+'^mozilla\/5\.0 firefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla',
+'^msie', 'Suspected bot masquerading as M$ IE',
+'netnewswire', 'netnewswire',
+' netseer ', '<a href="http://www.netseer.com/crawler.html">Net Seer</a>',
+'netvibes', 'netvibes',
+'newrelicpinger', 'newrelicpinger',
+'newsfox', 'Fox News',
+'nextgensearchbot', 'nextgensearchbot',
+'ning', 'ning',
+'pingdom', 'pingdom',
+'pita', 'pita (pain in the ass?)',
+'postpost', 'postpost',
+'postrank', 'postrank',
+'printfulbot', 'printfulbot',
+'protopage', 'protopage',
+'proximic', 'proximic',
+'quipply', 'quipply',
+'r6\_', '<a href="http://www.radian6.com/crawler">Radian 6 Crawler</a>',
+'ratingburner', 'ratingburner',
+'regator', 'regator',
+'rome client', 'rome client',
+'rpt\-httpclient', 'rpt-httpclient',
+'rssgraffiti', 'rssgraffiti',
+'sage\+\+', 'sage++',
+'scoutjet', '<a href="http://wwww.scoutjet.com/" target="_blank">ScoutJet</a> crawler for <a href="http://blekko.com/" target="_blank">Blekko</a>.',
+'simplepie', 'simplepie',
+'sitebot', 'sitebot',
+'summify\.com', '<a href="http://summify.com/">summify.com</a>',
+'superfeedr', 'superfeedr',
+'synthesio', 'synthesio',
+'teoma', 'teoma',
+'topblogsinfo', 'topblogsinfo',
+'topix\.net', 'topix.net',
+'trapit', 'trapit',
+'trileet', 'trileet',
+'tweetedtimes', '<a href="http://tweetedtimes.com">The Tweeted Times</a>',
+'twisted pagegetter', 'twisted pagegetter',
+'twitterbot', 'twitterbot',
+'twitterfeed', 'twitterfeed',
+'unwindfetchor', 'unwindfetchor',
+'wazzup', 'wazzup',
+'windows\-rss\-platform', 'windows-rss-platform',
+'wiumi', 'wiumi',
+'xydo', 'xydo',
+'yahoo! slurp', 'Additional Yahoo bots.',
+'yahoo pipes', 'Additional Yahoo bots.',
+'yahoo\-newscrawler', 'Additional Yahoo bots.',
+'yahoocachesystem', 'Additional Yahoo bots.',
+'yahooexternalcache', 'Additional Yahoo bots.',
+'yahoo! searchmonkey', 'Additional Yahoo bots.',
+'yahooysmcm', 'Additional Yahoo bots.',
+'yammer', 'yammer',
+'yandexbot', 'yandexbot',
+'yeti', 'yeti',
+'yie8', 'yie8',
+'youdao', 'youdao',
+'yourls', 'yourls',
+'zemanta', 'zemanta',
+'zend_http_client', 'Zend Http Client',
'no_user_agent','Unknown robot (identified by empty user agent string)',
# Unknown robots identified by hit on robots.txt
'unknown', 'Unknown robot (identified by hit on \'robots.txt\')'
'yahoo!_mindset'=>'Yahoo',
'zyborg'=>'Looksmart',
'cfetch'=>'Kosmix',
-'^voyager\/'=>'Kosmix'
++'^voyager\/'=>'Kosmix',
++# Additional bots found by Sussex.
++'feedfetcher\-google'=>'Google',
++'bingbot'=>'MSN',
++'twitterbot'=>'Twitter',
++'twitterfeed'=>'Twitter',
++'yahoo! slurp'=>'Yahoo',
++'yahoo pipes'=>'Yahoo',
++'yahoo-newscrawler'=>'Yahoo',
++'yahoocachesystem'=>'Yahoo',
++'yahooexternalcache'=>'Yahoo',
++'yahoo! searchmonkey'=>'Yahoo',
++'yahooysmcm'=>'Yahoo',
);
1;