From: visualperception Date: Tue, 13 Mar 2018 23:19:00 +0000 (+0000) Subject: Robots Detection Modifications X-Git-Tag: AWSTATS_7_8~21^2~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cd1c4fb73c5f76bd22f7306ef6c0313a65177721;p=thirdparty%2FAWStats.git Robots Detection Modifications added 37 new robots to robots.pm file using v 7.7 robots.pm file as base file. --- diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 8e01472d..5dc2aa54 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -43,6 +43,10 @@ # #------------------------------------------------------- +# 2018-03-13 RobC +# Added 36 robots and one generic ( survey ) using v 7.7 robots file as base. +# Also moved robot "Obot" into generics so that it is singled out as an individual Robot. +# # 2016-09-02 RobC # Fixed a few errors and added a few missing bots from awstats 7.5 release. # @@ -394,6 +398,7 @@ # Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. #------------------------------------------------------- + @RobotsSearchIDOrder_list1 = ( # Common robots (In robot file) 'bingbot/', @@ -407,6 +412,7 @@ 'Google[\x20]Web[\x20]Preview', 'Googlebot\-Image/', 'Googlebot\-Mobile/', +'Google[\x20]Page[\x20]Speed', 'google\-sitemaps', 'Googlebot\-News', 'Googlebot\-Video/', @@ -441,16 +447,23 @@ 'AdnormCrawlerCatchBot/', 'aiHitBot/', 'aipbot/', +'AlphaBot', 'Apache\-HttpClient/', 'Apexoo[\x20]Spider', 'Applebot/', 'archive\.org_bot', 'Babya[\x20]Discoverer', +'Barkrowler', 'BDCbot/', +'BellPagesCA/', +'BeNosey[\x20]Mohawk[\x20]Search', +'bhcBot', +'BigBozz/', 'BinGet/', 'bl\.uk_lddc_bot/', 'BLEXBot/', 'boitho\.com\-dc/', +'BoogleBot', 'BusinessBot:', 'CatchBot/', 'CB/Nutch', @@ -467,10 +480,14 @@ 'Curl/PHP', 'Dalvik/', 'DataCrawler/', +'daumoa', +'daum', 'Deepnet[\x20]Explorer', 'DeuSu/', 'Digincore', 'Discordbot/', +'Dispatch/', +'DnyzBot', 'DoCoMo/', 'Domain[\x20]Re\-Animator[\x20]Bot', 'DomainCrawler/', @@ -494,7 +511,9 @@ 'findlinks/', 'Findxbot/', 'FirePHP/', +'firstdirectory\-bot', 'FlippyBearBot/', +'^foo$', 'FreeWebMonitoring[\x20]SiteChecker/', 'fujilabol', 'FurlBot/', @@ -507,18 +526,24 @@ 'Girafabot', 'Gluten[\x20]Free[\x20]Crawler/', 'gocrawl', +'Go\-http\-client/', 'GrapeshotCrawler/', 'GSiteCrawler/', 'GurujiBot/', +'hadiBot', 'HaosouSpider', +'HELLO[\x20]Crawler', 'holmes/', +'houzzbot', 'HTTP_Request2/', 'HubSpot[\x20]Webcrawler', 'HyperCrawl/', 'ICC\-Crawler/', 'iconoclast', 'IDGCrawler/Nutch', +'IDG/UK', 'idmarch[\x20]Automatic\.beta/', +'InbyBot', 'Incutio[\x20]XML', 'InfluenceBot', 'IRLbot/', @@ -527,6 +552,7 @@ 'James[\x20]BOT', 'Jigsaw/', 'JobFeed', +'Jooblebot', 'KomodiaBot/', 'Konqueror/', 'linkapediabot', @@ -541,6 +567,7 @@ 'LWP::Simple/', 'Mail\.RU_Bot/', 'meanpathbot/', +'Mechanize', 'Mediatoolkitbot', 'MegaIndex\.ru/', 'merzscope', @@ -557,25 +584,29 @@ 'NerdyBot', 'netEstate[\x20]NE[\x20]Crawler', 'NetResearchServer/', +'nominet', 'NRLCorpusBuilder/Nutch', 'nutch\-1\.4/', 'nutch\-1\.8/', 'NutchCVS/', 'o\.uk[\x20]robot', -'oBot/', 'ocrawler;', 'ODP[\x20]link[\x20]checker', 'Offline[\x20]Explorer/', 'OmniExplorer_Bot/', 'OrangeBot/', 'PageBitesHyperBot/', +'Pcore', 'pdffillerbot/', +'peopleman', 'PhantomJS', 'PHP/5\.2\.8', +'Pinterestbot', 'Ploetz[\x20]\+[\x20]Zeller', 'Plukkie/', 'Princetonbot/', 'PrivacyAwareBot/', +'Prlog/', 'proximic', 'psbot/', 'psbot\-image', @@ -592,9 +623,12 @@ 'SafeSearch[\x20]microdata[\x20]crawler', 'safesearch', 'SBL\-BOT', +'scrapy', 'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/', 'ScreenerBot[\x20]Crawler[\x20]Beta', +'Scrubby', 'Searchie/', +'SecurityResearch\.bot', 'Seekmo', 'semanticbot', 'SemrushBot/', @@ -608,11 +642,14 @@ 'Shim\-Crawler', 'SiteExplorer/', 'siteexplorer\.info', +'siteimprove', 'Slackbot\-LinkExpanding', 'SmabblerBot/', 'Sogou[\x20]web[\x20]spider/', 'special_archiver/', 'Spiderbot/', +'SpuhexBot', +'spyonweb', 'ssearch_bot', 'SurdotlyBot/', 'SurveyBot/', @@ -634,6 +671,7 @@ 'vBSEO', 'vBulletin[\x20]via[\x20]PHP', 'vebidoobot', +'vegi[\x20]bot', 'viz/Nutch', 'VoilaBot', 'VORTEX/', @@ -641,6 +679,7 @@ 'W3C_Validator/', 'W3C\-checklink/', 'WBSearchBot/', +'WbSrch/', 'WeSEE:Ads/PageBot', 'WeSEE:Ads/PictureBot', 'WeSEE_Bot', @@ -658,6 +697,7 @@ 'Yahoo[\x20]Link[\x20]Preview', 'YisouSpider', 'yoozBot', +'Your\-Website\-Sucks', 'zspider/', 'ZumBot/', # below placed at end to catch some generics @@ -696,6 +736,7 @@ 'MSIE[\x20]4', 'MSIE[\x20]5', 'MSIE[\x20]6', +'MSIE\+6\.0\;', 'Windows[\x20]95', 'Windows[\x20]98', @@ -816,7 +857,6 @@ 'cyberspyder', 'datafountains/dmoz_downloader', 'dataprovider\.com', -'daumoa', 'daviesbot', 'daylifefeedfetcher', 'daypopbot', @@ -1455,6 +1495,7 @@ @RobotsSearchIDOrder_listgen = ( # Generic robot 'robot', +'oBot/', 'blog', 'checker', 'crawl', @@ -1468,6 +1509,7 @@ 'sitemap', 'spider', 'sucker', +'survey', 'validator', 'bot[\s_+:,\.\;\/\\\-]', '[\s_+:,\.\;\/\\\-]bot', @@ -1494,6 +1536,7 @@ 'Google[\x20]Web[\x20]Preview','Google Web Preview', 'Googlebot\-Image/','Googlebot-Image', 'Googlebot\-Mobile/','Googlebot-Mobile', +'Google[\x20]Page[\x20]Speed','Google Page Speed', 'google\-sitemaps','google-sitemaps', 'Googlebot\-News','Googlebot-News', 'Googlebot\-Video/','Googlebot-Video', @@ -1528,16 +1571,23 @@ 'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot', 'aiHitBot/','aiHitBot', 'aipbot/','aipbot', +'AlphaBot','AlphaBot', 'Apache\-HttpClient/','Apache-HttpClient', 'Apexoo[\x20]Spider','Apexoo Spider', 'Applebot/','Applebot', 'archive\.org_bot','archive.org_bot', 'Babya[\x20]Discoverer','Babya Discoverer', +'Barkrowler','Barkrowler', 'BDCbot/','BDCbot', +'BellPagesCA/','BellPagesCA', +'BeNosey[\x20]Mohawk[\x20]Search','BeNosey Mohawk Search', +'bhcBot','bhcBot', +'BigBozz/','BigBozz', 'BinGet/','BinGet', 'bl\.uk_lddc_bot/','bl.uk_lddc_bot', 'BLEXBot/','BLEXBot', 'boitho\.com\-dc/','boitho.com-dc', +'BoogleBot','BoogleBot', 'BusinessBot:','BusinessBot:', 'CatchBot/','CatchBot', 'CB/Nutch','CB/Nutch', @@ -1554,10 +1604,14 @@ 'Curl/PHP','Curl/PHP', 'Dalvik/','Dalvik', 'DataCrawler/','DataCrawler', +'daumoa','daumoa', +'daum','daum', 'Deepnet[\x20]Explorer','Deepnet Explorer', 'DeuSu/','DeuSu', 'Digincore','Digincore', 'Discordbot/','Discordbot', +'Dispatch/','Dispatch', +'DnyzBot','DnyzBot', 'DoCoMo/','DoCoMo', 'Domain[\x20]Re\-Animator[\x20]Bot','Domain Re-Animator Bot', 'DomainCrawler/','DomainCrawler', @@ -1581,7 +1635,9 @@ 'findlinks/','findlinks', 'Findxbot/','Findxbot', 'FirePHP/','FirePHP', +'firstdirectory\-bot','firstdirectory-bot', 'FlippyBearBot/','FlippyBearBot', +'^foo$','foo', 'FreeWebMonitoring[\x20]SiteChecker/','FreeWebMonitoring SiteChecker', 'fujilabol','fujilabol', 'FurlBot/','FurlBot', @@ -1594,18 +1650,24 @@ 'Girafabot','Girafabot', 'Gluten[\x20]Free[\x20]Crawler/','Gluten Free Crawler', 'gocrawl','gocrawl', +'Go\-http\-client/','Go-http-client', 'GrapeshotCrawler/','GrapeshotCrawler', 'GSiteCrawler/','GSiteCrawler', 'GurujiBot/','GurujiBot', +'hadiBot','hadiBot', 'HaosouSpider','HaosouSpider', +'HELLO[\x20]Crawler','HELLO Crawler', 'holmes/','holmes', +'houzzbot','houzzbot', 'HTTP_Request2/','HTTP_Request2', 'HubSpot[\x20]Webcrawler','HubSpot Webcrawler', 'HyperCrawl/','HyperCrawl', 'ICC\-Crawler/','ICC-Crawler', 'iconoclast','iconoclast', 'IDGCrawler/Nutch','IDGCrawler/Nutch', +'IDG/UK','IDG/UK', 'idmarch[\x20]Automatic\.beta/','idmarch Automatic.beta', +'InbyBot','InbyBot', 'Incutio[\x20]XML','Incutio XML', 'InfluenceBot','InfluenceBot', 'IRLbot/','IRLbot', @@ -1614,6 +1676,7 @@ 'James[\x20]BOT','James BOT', 'Jigsaw/','Jigsaw', 'JobFeed','JobFeed', +'Jooblebot','Jooblebot', 'KomodiaBot/','KomodiaBot', 'Konqueror/','Konqueror', 'linkapediabot','linkapediabot', @@ -1628,6 +1691,7 @@ 'LWP::Simple/','LWP::Simple', 'Mail\.RU_Bot/','Mail.RU Bot', 'meanpathbot/','meanpathbot', +'Mechanize','Mechanize', 'Mediatoolkitbot','Mediatoolkitbot', 'MegaIndex\.ru/','MegaIndex.ru', 'merzscope','merzscope', @@ -1644,25 +1708,29 @@ 'NerdyBot','NerdyBot', 'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler', 'NetResearchServer/','NetResearchServer', +'nominet','nominet', 'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch', 'nutch\-1\.4/','nutch-1.4', 'nutch\-1\.8/','nutch-1.8', 'NutchCVS/','NutchCVS', 'o\.uk[\x20]robot','o uk.robot', -'oBot/','oBot', 'ocrawler;','ocrawler;', 'ODP[\x20]link[\x20]checker','ODP link checker', 'Offline[\x20]Explorer/','Offline Explorer', 'OmniExplorer_Bot/','OmniExplorer_Bot', 'OrangeBot/','OrangeBot', 'PageBitesHyperBot/','PageBitesHyperBot', +'Pcore','Pcore', 'pdffillerbot/','pdffillerbot', +'peopleman','peopleman', 'PhantomJS','PhantomJS', 'PHP/5\.2\.8','PHP/5.2.8', +'Pinterestbot','Pinterestbot', 'Ploetz[\x20]\+[\x20]Zeller','Ploetz + Zeller', 'Plukkie/','Plukkie', 'Princetonbot/','Princetonbot', 'PrivacyAwareBot/','PrivacyAwareBot', +'Prlog/','Prlog', 'proximic','proximic', 'psbot/','psbot', 'psbot\-image','psbot-image', @@ -1679,9 +1747,12 @@ 'SafeSearch[\x20]microdata[\x20]crawler','SafeSearch microdata crawler', 'safesearch','safesearch ( catchall )', 'SBL\-BOT','SBL-BOT', +'scrapy','scrapy', 'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/','Screaming Frog SEO Spider', 'ScreenerBot[\x20]Crawler[\x20]Beta','ScreenerBot Crawler Beta', +'Scrubby','Scrubby', 'Searchie/','Searchie', +'SecurityResearch\.bot','Security Research Bot', 'Seekmo','Seekmo', 'semanticbot','semanticbot', 'SemrushBot/','SemrushBot', @@ -1695,11 +1766,14 @@ 'Shim\-Crawler','Shim-Crawler', 'SiteExplorer/','SiteExplorer', 'siteexplorer\.info','siteexplorer.info', +'siteimprove','siteimprove', 'Slackbot\-LinkExpanding','Slackbot-LinkExpanding', 'SmabblerBot/','SmabblerBot', 'Sogou[\x20]web[\x20]spider/','Sogou web spider', 'special_archiver/','special_archiver', 'Spiderbot/','Spiderbot', +'SpuhexBot','SpuhexBot', +'spyonweb','spyonweb', 'ssearch_bot','ssearch_bot', 'SurdotlyBot/','SurdotlyBot', 'SurveyBot/','SurveyBot', @@ -1721,6 +1795,7 @@ 'vBSEO','vBSEO', 'vBulletin[\x20]via[\x20]PHP','vBulletin via PHP', 'vebidoobot','vebidoobot', +'vegi[\x20]bot','vegi bot', 'viz/Nutch','viz/Nutch', 'VoilaBot','VoilaBot', 'VORTEX/','VORTEX', @@ -1728,6 +1803,7 @@ 'W3C_Validator/','W3C_Validator', 'W3C\-checklink/','W3C-checklink', 'WBSearchBot/','WBSearchBot', +'WbSrch/','WbSrch/', 'WeSEE:Ads/PageBot','WeSEE:Ads/PageBot', 'WeSEE:Ads/PictureBot','WeSEE:Ads/PictureBot', 'WeSEE_Bot','WeSEE_Bot', @@ -1745,6 +1821,7 @@ 'Yahoo[\x20]Link[\x20]Preview','Yahoo Link Preview', 'YisouSpider','YisouSpider', 'yoozBot','yoozBot', +'Your\-Website\-Sucks','Your-Website-Sucks', 'zspider/','zspider', 'ZumBot/','ZumBot', # below placed at end to catch some generics @@ -1780,6 +1857,7 @@ 'MSIE[\x20]4','MSIE 4 - ( Rogue Robot )', 'MSIE[\x20]5','MSIE 5 - ( Rogue Robot )', 'MSIE[\x20]6','MSIE 6 - ( Rogue Robot )', +'MSIE\+6\.0\;','MSIE+6.0; - ( Rogue Robot)', 'Windows[\x20]95','Windows 95 - ( Rogue Robot )', 'Windows[\x20]98','Windows 99 - ( Rogue Robot )', @@ -1900,7 +1978,6 @@ 'cyberspyder','cyberspyder', 'datafountains/dmoz_downloader','datafountains/dmoz_downloader', 'dataprovider\.com','dataprovider.com', -'daumoa','daumoa', 'daviesbot','daviesbot', 'daylifefeedfetcher','daylifefeedfetcher', 'daypopbot','daypopbot', @@ -2537,6 +2614,7 @@ # Generic robot 'robot','robot', +'oBot/','oBot', 'blog','blog', 'checker','checker', 'crawl','crawl', @@ -2550,6 +2628,7 @@ 'sitemap','sitemap', 'spider','spider', 'sucker','sucker', +'survey','survey', 'validator','validator', 'bot[\s_+:,\.\;\/\\\-]','Unknown robot identified by bot\*', '[\s_+:,\.\;\/\\\-]bot','Unknown robot identified by \*bot',