From 2698a61e6c5e71a3270ceedc9aea7a3dbe7b8ad2 Mon Sep 17 00:00:00 2001 From: Mikel Olasagasti Uranga Date: Fri, 13 Sep 2019 20:22:57 +0200 Subject: [PATCH] Update robots.pm with PR118 data. Also added: - PiplBot bot - um-IC & um-LN bot - arcemedia - bit.ly - bidswitchbot - bnf.fr_bot - contxbot - flamingo - getintent (variant) - laserlikebot - mappy - mojeek (variant) - serendeputy - trendiction - yak (linkfluence) - zoominfobot Signed-off-by: Mikel Olasagasti Uranga --- wwwroot/cgi-bin/lib/robots.pm | 88 ++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 856acd95..19a95905 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -429,6 +429,7 @@ 'baidu', 'YandexBot/', 'YandexImages/', +'YandexImageResizer', 'YandexMetrika/', 'YandexMobileBot/', 'yandex', @@ -444,7 +445,9 @@ 'Abrave', 'acapbot/', 'Accoona\-AI\-Agent/', +'arcemedia', 'AdnormCrawlerCatchBot/', +'adscanner', 'aiHitBot/', 'aipbot/', 'AlphaBot', @@ -458,10 +461,13 @@ 'BellPagesCA/', 'BeNosey[\x20]Mohawk[\x20]Search', 'bhcBot', +'bidswitchbot', 'BigBozz/', 'BinGet/', +'bitlybot', 'bl\.uk_lddc_bot/', 'BLEXBot/', +'bnf.fr_bot', 'boitho\.com\-dc/', 'BoogleBot', 'BusinessBot:', @@ -470,14 +476,20 @@ 'CCBot/', 'Cliqzbot/', 'CMS[\x20]Crawler', +'Companybook\-Crawler', 'ConveraCrawler/', +'Contacts-Crawler', +'contxbot', 'cosmos/', 'crawl/Nutch', 'crawler4j', 'CRAZYWEBCRAWLER', +'CRMNLCrawlAgent', 'CSE[\x20]HTML[\x20]Validator', 'C\-T[\x20]bot', +'CUBOT', 'Curl/PHP', +'cyencebot', 'Dalvik/', 'DataCrawler/', 'daumoa', @@ -495,14 +507,18 @@ 'DomainSONOCrawler/', 'DomainStatsBot/', 'DotBot/', +'DuckDuckBot-Https', 'DuckDuckGo\-Favicons\-Bot/', 'ELinks/', 'ELinks[\x20]\(', 'EmailMarketingRobot/', 'EmeraldShield\.com[\x20]WebBot', 'envolk\[ITS\]spider/', +'eright', 'EsperanzaBot', 'Exabot/', +'ExtLinksBot', +'ExperianCrawlUK', 'facebookexternalhit/',
'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', @@ -512,6 +528,7 @@ 'Findxbot/', 'FirePHP/', 'firstdirectory\-bot', +'flamingo', 'FlippyBearBot/', '^foo$', 'FreeWebMonitoring[\x20]SiteChecker/', @@ -520,12 +537,14 @@ 'Gaisbot/', 'Gallent[\x20]Spider', 'GarlikCrawler/', +'Getintent[\x20]Crawler', 'GetintentCrawler[\x20]getintent\.com', 'Gigabot/', 'gipo\-crawler/Nutch', 'Girafabot', 'Gluten[\x20]Free[\x20]Crawler/', 'gocrawl', +'Gowikibot', 'Go\-http\-client/', 'GrapeshotCrawler/', 'GSiteCrawler/', @@ -545,6 +564,7 @@ 'idmarch[\x20]Automatic\.beta/', 'InbyBot', 'Incutio[\x20]XML', +'IndeedBot', 'InfluenceBot', 'IRLbot/', 'IssueCrawler', @@ -555,6 +575,8 @@ 'Jooblebot', 'KomodiaBot/', 'Konqueror/', +'laserlikebot', +'Lightspeed', 'linkapediabot', 'metager\-linkchecker', 'linkchecker', @@ -566,24 +588,30 @@ 'LinksManager\.com_bot', 'LWP::Simple/', 'Mail\.RU_Bot/', +'makecontact', +'mappy', +'MauiBot', 'meanpathbot/', 'Mechanize', 'Mediatoolkitbot', 'MegaIndex\.ru/', 'merzscope', +'Meta_Bot', 'mfibot/', 'microsoft.*discovery', 'missigua_locator', 'MixrankBot', 'MJ12bot/', -'MojeekBot/', +'MojeekBot', 'Mojolicious', 'MXT/Nutch', 'My[\x20]Nutch[\x20]Spider/', 'myse/Nutch', +'Naaraa', 'NerdyBot', 'netEstate[\x20]NE[\x20]Crawler', 'NetResearchServer/', +'Nimbostratus-Bot', 'nominet', 'NRLCorpusBuilder/Nutch', 'nutch\-1\.4/', @@ -595,6 +623,8 @@ 'Offline[\x20]Explorer/', 'OmniExplorer_Bot/', 'OrangeBot/', +'Orliac', +'OutclicksBot', 'PageBitesHyperBot/', 'Pcore', 'pdffillerbot/', @@ -602,6 +632,7 @@ 'PhantomJS', 'PHP/5\.2\.8', 'Pinterestbot', +'PiplBot', 'Ploetz[\x20]\+[\x20]Zeller', 'Plukkie/', 'Princetonbot/', @@ -613,6 +644,7 @@ 'python_wk_crawler', 'Python\-urllib/', 'QCrawl', +'Quick-Crawler', 'ResearchBot', 'roboto', 'rogerbot/', @@ -632,6 +664,7 @@ 'Seekmo', 'semanticbot', 'SemrushBot/', +'SemrushBot-SI', 'seo\-audit\-check\-bot/', 'Seobility', 'SEOkicks\-Robot', @@ -639,6 
+672,7 @@ 'SEOstats', 'Seosys/Nutch', 'Seoterritory\.com[\x20]bot', +'serendeputy', 'Shim\-Crawler', 'SiteExplorer/', 'siteexplorer\.info', @@ -651,6 +685,7 @@ 'SpuhexBot', 'spyonweb', 'ssearch_bot', +'Streamline3Bot', 'SurdotlyBot/', 'SurveyBot/', 'taiil/Nutch', @@ -661,10 +696,13 @@ 'Test[\x20]Spider', 'TestCrawler', 'The[\x20]Knowledge[\x20]AI', +'tracemyfile', +'trendiction', 'TurnitinBot/', 'TurnitinBot', 'TweetmemeBot/', 'UCY/Nutch', +'uni-leipzig\.de', 'Uptimebot/', 'UptimeRobot/', 'URL[\x20]Checker', @@ -674,6 +712,7 @@ 'vBulletin[\x20]via[\x20]PHP', 'vebidoobot', 'vegi[\x20]bot', +'Velen', 'viz/Nutch', 'VoilaBot', 'VORTEX/', @@ -697,9 +736,11 @@ 'XoviBot/', 'yacybot', 'Yahoo[\x20]Link[\x20]Preview', +'yak', 'YisouSpider', 'yoozBot', 'Your\-Website\-Sucks', +'zoominfobot', 'zspider/', 'ZumBot/', # below placed at end to catch some generics @@ -1364,6 +1405,8 @@ 'ucsd', 'udmsearch', 'ultraseek', +'um\-IC', +'um\-LN', 'unchaos_bot_hybrid_web_search_engine', 'unido\-bot', 'unisterbot', @@ -1556,6 +1599,7 @@ 'baidu','Baidu ( catchall )', 'YandexBot/','YandexBot', 'YandexImages/','YandexImages', +'YandexImageResizer','YandexImageResizer', 'YandexMetrika/','YandexMetrika', 'YandexMobileBot/','YandexMobileBot', 'yandex','Yandex ( catchall )', @@ -1572,12 +1616,14 @@ 'acapbot/','acapbot', 'Accoona\-AI\-Agent/','Accoona-AI-Agent', 'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot', +'adscanner','adscanner', 'aiHitBot/','aiHitBot', 'aipbot/','aipbot', 'AlphaBot','AlphaBot', 'Apache\-HttpClient/','Apache-HttpClient', 'Apexoo[\x20]Spider','Apexoo Spider', 'Applebot/','Applebot', +'arcemedia','AdsBot-ArceMedia', 'archive\.org_bot','archive.org_bot', 'Babya[\x20]Discoverer','Babya Discoverer', 'Barkrowler','Barkrowler', @@ -1585,10 +1631,13 @@ 'BellPagesCA/','BellPagesCA', 'BeNosey[\x20]Mohawk[\x20]Search','BeNosey Mohawk Search', 'bhcBot','bhcBot', +'bidswitchbot','bidswitchbot', 'BigBozz/','BigBozz', 'BinGet/','BinGet', +'bitlybot','bit.ly', 
'bl\.uk_lddc_bot/','bl.uk_lddc_bot', 'BLEXBot/','BLEXBot', +'bnf.fr_bot','bnf.fr_bot', 'boitho\.com\-dc/','boitho.com-dc', 'BoogleBot','BoogleBot', 'BusinessBot:','BusinessBot:', @@ -1597,14 +1646,20 @@ 'CCBot/','CCBot', 'Cliqzbot/','Cliqzbot', 'CMS[\x20]Crawler','CMS Crawler', +'Companybook\-Crawler','Companybook-Crawler', 'ConveraCrawler/','ConveraCrawler', +'Contacts-Crawler','Contacts-Crawler', +'contxbot','contxbot', 'cosmos/','cosmos', +'CRMNLCrawlAgent','CRMNLCrawlAgent', 'crawl/Nutch','crawl/Nutch', 'crawler4j','crawler4j', 'CRAZYWEBCRAWLER','CRAZYWEBCRAWLER', 'CSE[\x20]HTML[\x20]Validator','CSE HTML Validator', 'C\-T[\x20]bot','C-T bot', +'CUBOT','CUBOT', 'Curl/PHP','Curl/PHP', +'cyencebot','cyencebot', 'Dalvik/','Dalvik', 'DataCrawler/','DataCrawler', 'daumoa','daumoa', @@ -1622,14 +1677,18 @@ 'DomainSONOCrawler/','DomainSONOCrawler', 'DomainStatsBot/','DomainStatsBot', 'DotBot/','DotBot', +'DuckDuckBot-Https','DuckDuckBot-Https', 'DuckDuckGo\-Favicons\-Bot/','DuckDuckGo-Favicons-Bot', 'ELinks/','ELinks', 'ELinks[\x20]\(','ELinks (', 'EmailMarketingRobot/','EmailMarketingRobot', 'EmeraldShield\.com[\x20]WebBot','EmeraldShield.com WebBot', 'envolk\[ITS\]spider/','envolk ITS spider', +'eright','eright', 'EsperanzaBot','EsperanzaBot', 'Exabot/','Exabot', +'ExtLinksBot','ExtLinksBot', +'ExperianCrawlUK','ExperianCrawlUK', 'facebookexternalhit/','facebookexternalhit', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise crawleradmin.t-info@telekom.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise T-Info_BI_cluster crawleradmin.t-info@telekom.de', @@ -1639,6 +1698,7 @@ 'Findxbot/','Findxbot', 'FirePHP/','FirePHP', 'firstdirectory\-bot','firstdirectory-bot', +'flamingo','Flamingo_SearchEngine', 'FlippyBearBot/','FlippyBearBot', '^foo$','foo', 'FreeWebMonitoring[\x20]SiteChecker/','FreeWebMonitoring SiteChecker', @@ -1647,12 +1707,14 @@ 'Gaisbot/','Gaisbot', 
'Gallent[\x20]Spider','Gallent Spider', 'GarlikCrawler/','GarlikCrawler', +'Getintent[\x20]Crawler','GetIntent Crawler', 'GetintentCrawler[\x20]getintent\.com','GetintentCrawler getintent.com', 'Gigabot/','Gigabot', 'gipo\-crawler/Nutch','gipo-crawler/Nutch', 'Girafabot','Girafabot', 'Gluten[\x20]Free[\x20]Crawler/','Gluten Free Crawler', 'gocrawl','gocrawl', +'Gowikibot','Gowikibot', 'Go\-http\-client/','Go-http-client', 'GrapeshotCrawler/','GrapeshotCrawler', 'GSiteCrawler/','GSiteCrawler', @@ -1672,6 +1734,7 @@ 'idmarch[\x20]Automatic\.beta/','idmarch Automatic.beta', 'InbyBot','InbyBot', 'Incutio[\x20]XML','Incutio XML', +'IndeedBot','IndeedBot', 'InfluenceBot','InfluenceBot', 'IRLbot/','IRLbot', 'IssueCrawler','IssueCrawler', @@ -1682,6 +1745,7 @@ 'Jooblebot','Jooblebot', 'KomodiaBot/','KomodiaBot', 'Konqueror/','Konqueror', +'Lightspeed','Lightspeed', 'linkapediabot','linkapediabot', 'metager\-linkchecker','metager-linkchecker', 'linkchecker','linkchecker', @@ -1693,24 +1757,31 @@ 'LinksManager\.com_bot','LinksManager.com_bot', 'LWP::Simple/','LWP::Simple', 'Mail\.RU_Bot/','Mail.RU Bot', +'makecontact','makecontact', +'mappy','Mappy Crawler', +'MauiBot','MauiBot', 'meanpathbot/','meanpathbot', 'Mechanize','Mechanize', 'Mediatoolkitbot','Mediatoolkitbot', 'MegaIndex\.ru/','MegaIndex.ru', 'merzscope','merzscope', +'Meta_Bot','Meta_Bot', 'mfibot/','mfibot', 'microsoft.*discovery','Microsoft Office Protocol Discovery', 'missigua_locator','missigua_locator', 'MixrankBot','MixrankBot', 'MJ12bot/','MJ12bot', +'MojeekBot','MojeekBot', 'MojeekBot/','MojeekBot', 'Mojolicious','Mojolicious', 'MXT/Nutch','MXT/Nutch', 'My[\x20]Nutch[\x20]Spider/','My Nutch Spider', 'myse/Nutch','myse/Nutch', +'Naaraa','Naaraa', 'NerdyBot','NerdyBot', 'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler', 'NetResearchServer/','NetResearchServer', +'Nimbostratus-Bot','Nimbostratus-Bot', 'nominet','nominet', 'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch', 'nutch\-1\.4/','nutch-1.4', @@
-1722,6 +1793,8 @@ 'Offline[\x20]Explorer/','Offline Explorer', 'OmniExplorer_Bot/','OmniExplorer_Bot', 'OrangeBot/','OrangeBot', +'Orliac','Orliac', +'OutclicksBot','OutclicksBot', 'PageBitesHyperBot/','PageBitesHyperBot', 'Pcore','Pcore', 'pdffillerbot/','pdffillerbot', @@ -1729,6 +1802,7 @@ 'PhantomJS','PhantomJS', 'PHP/5\.2\.8','PHP/5.2.8', 'Pinterestbot','Pinterestbot', +'PiplBot','PiplBot', 'Ploetz[\x20]\+[\x20]Zeller','Ploetz + Zeller', 'Plukkie/','Plukkie', 'Princetonbot/','Princetonbot', @@ -1740,6 +1814,7 @@ 'python_wk_crawler','python_wk_crawler', 'Python\-urllib/','Python-urllib', 'QCrawl','QCrawl', +'Quick-Crawler','Quick-Crawler', 'ResearchBot','ResearchBot', 'roboto','roboto', 'rogerbot/','rogerbot', @@ -1759,6 +1834,7 @@ 'Seekmo','Seekmo', 'semanticbot','semanticbot', 'SemrushBot/','SemrushBot', +'SemrushBot-SI','SemrushBot-SI', 'seo\-audit\-check\-bot/','seo-audit-check-bot', 'Seobility','Seobility', 'SEOkicks\-Robot','SEOkicks-Robot', @@ -1766,6 +1842,7 @@ 'SEOstats','SEOstats', 'Seosys/Nutch','Seosys/Nutch', 'Seoterritory\.com[\x20]bot','Seoterritory.com.bot', +'serendeputy','serendeputy', 'Shim\-Crawler','Shim-Crawler', 'SiteExplorer/','SiteExplorer', 'siteexplorer\.info','siteexplorer.info', @@ -1778,6 +1855,7 @@ 'SpuhexBot','SpuhexBot', 'spyonweb','spyonweb', 'ssearch_bot','ssearch_bot', +'Streamline3Bot','Streamline3Bot', 'SurdotlyBot/','SurdotlyBot', 'SurveyBot/','SurveyBot', 'taiil/Nutch','taiil/Nutch', @@ -1788,10 +1866,13 @@ 'Test[\x20]Spider','Test Spider', 'TestCrawler','TestCrawler', 'The[\x20]Knowledge[\x20]AI', 'The Knowledge AI', +'tracemyfile','tracemyfile', +'trendiction','trendiction', 'TurnitinBot/','TurnitinBot', 'TurnitinBot','TurnitinBot', 'TweetmemeBot/','TweetmemeBot', 'UCY/Nutch','UCY/Nutch', +'uni-leipzig\.de','uni-leipzig.de', 'Uptimebot/','Uptimebot', 'UptimeRobot/','UptimeRobot', 'URL[\x20]Checker','URL Checker', @@ -1801,6 +1882,7 @@ 'vBulletin[\x20]via[\x20]PHP','vBulletin via PHP', 'vebidoobot','vebidoobot', 
'vegi[\x20]bot','vegi bot', +'Velen','Velen', 'viz/Nutch','viz/Nutch', 'VoilaBot','VoilaBot', 'VORTEX/','VORTEX', @@ -1824,9 +1906,11 @@ 'XoviBot/','XoviBot', 'yacybot','yacybot', 'Yahoo[\x20]Link[\x20]Preview','Yahoo Link Preview', +'yak','yak-linkfluence', 'YisouSpider','YisouSpider', 'yoozBot','yoozBot', 'Your\-Website\-Sucks','Your-Website-Sucks', +'zoominfobot','zoominfobot', 'zspider/','zspider', 'ZumBot/','ZumBot', # below placed at end to catch some generics @@ -2488,6 +2572,8 @@ 'ucsd','ucsd', 'udmsearch','udmsearch', 'ultraseek','ultraseek', +'um\-IC','ubermetrics-technologies.com', +'um\-LN','ubermetrics-technologies.com', 'unchaos_bot_hybrid_web_search_engine','unchaos_bot_hybrid_web_search_engine', 'unido\-bot','unido-bot', 'unisterbot','unisterbot', -- 2.47.2