]> git.ipfire.org Git - thirdparty/AWStats.git/commitdiff
Added new (AI) bots to robots.pm. Also fixed some inconsistencies in the bot lists.
author dpw0001 <daniel@wimpff.de>
Fri, 26 Sep 2025 11:51:13 +0000 (13:51 +0200)
committer dpw0001 <daniel@wimpff.de>
Fri, 26 Sep 2025 11:51:13 +0000 (13:51 +0200)
wwwroot/cgi-bin/lib/robots.pm

index a7b22451f6c75e144820523f1dbac820203edac3..f6903aca34f44ed02230254fb3ef23d95956c41e 100644 (file)
 #\r
 #-------------------------------------------------------\r
 \r
+# 2025-09-26 Daniel-Percy Wimpff <daniel@wimpff.de>\r
+#              Added AI2Bot (Allen Institute, https://allenai.org/crawler)\r
+#              Added Applebot-Extended (Apple AI indexer)\r
+#              Added anthropic-ai (Anthropic)\r
+#              Added ChatGPT-User (OpenAI on demand URL fetcher, https://openai.com/bot)\r
+#              Added ClaudeBot (Anthropic)\r
+#              Added claude-web (Anthropic)\r
+#              Added cohere-ai (Cohere, according to https://momenticmarketing.com/blog/ai-search-crawlers-bots)\r
+#              Added Diffbot\r
+#              Added DuckAssistBot (DuckDuckGo AI)\r
+#              Added facebookcatalog (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+#              Added FacebookBot (Meta, according to https://momenticmarketing.com/blog/ai-search-crawlers-bots)\r
+#              Added Google-CloudVertexBot (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+#              Added Google-Extended (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+#              Added Google-InspectionTool (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+#              Added GoogleOther (also matches GoogleOther-Image, GoogleOther-Video, https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+#              Added meta-externalads (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+#              Added meta-externalagent (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+#              Added meta-externalfetcher (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+#              Added meta-webindexer (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+#              Added MistralAI-User (mistral.ai)\r
+#              Added OAI-SearchBot (OpenAI indexer, https://openai.com/searchbot)\r
+#              Added omgili (Webz.io, https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/)\r
+#              Added PerplexityBot (https://perplexity.ai/perplexitybot)\r
+#              Added Perplexity-User (https://perplexity.ai/perplexitybot)\r
+#              Added Storebot-Google (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+#              Added Timpibot (http://www.timpi.io)\r
+#              Added YouBot (You.com)\r
+#              \r
+#              Edited description for facebookexternalhit in %RobotsHashIDLib\r
+#              Edited entry and description for Bytespider\r
+#              Edited description for CCBot\r
+#              \r
+#              Removed facebook (not documented by Meta - substituted by adding entries for FacebookBot and meta-... crawlers)\r
+#\r
+#              Fixed: Missing entry for laserlikebot in %RobotsHashIDLib\r
+#              Fixed: Entries for MojeekBot differ in @RobotsSearchIDOrder_list1 and %RobotsHashIDLib. Using variant with trailing slash now.\r
+#              Fixed: Unescaped dashes (-) in entries Mediapartners-Google, Baiduspider-, Contacts-Crawler, DuckDuckBot-Https, Nimbostratus-Bot, Quick-Crawler, SemrushBot-SI, uni-leipzig.de\r
+#              Fixed: Unescaped dot (.) in entry bnf.fr_bot\r
+#              Fixed: Removed deprecated entry mojeek from %RobotsHashIDLib\r
+\r
 # 2024-07-17 Dinko Sotirov\r
 #              Added Amazonbot (https://developer.amazon.com/support/amazonbot)\r
 #              Added GPTBot (https://openai.com/gptbot)\r
 'Googlebot\-Image/',\r
 'Googlebot\-Mobile/',\r
 'Google[\x20]Page[\x20]Speed',\r
+'Google\-Extended/',\r
+'Google\-InspectionTool/',\r
 'google\-sitemaps',\r
 'Googlebot\-News',\r
 'Googlebot\-Video/',\r
+'GoogleOther',\r
+'Google\-CloudVertexBot',\r
 'AdsBot\-Google[\x20]\(',\r
 'AdsBot\-Google\-Mobile\-Apps',\r
 'Adsbot',\r
-'Mediapartners-Google',\r
+'Mediapartners\-Google',\r
 'Feedfetcher\-Google',\r
 'Google\-Adwords\-Instant',\r
 'Firefox/1\.5',\r
 'Yahoo![\x20]Slurp',\r
 'Baiduspider/',\r
 'Baiduspider\-image',\r
-'Baiduspider-',\r
+'Baiduspider\-',\r
 'YandexBot/',\r
 'YandexImages/',\r
 'YandexImageResizer',\r
 'arcemedia',\r
 'AdnormCrawlerCatchBot/',\r
 'adscanner',\r
+'AI2Bot',\r
 'aiHitBot/',\r
 'aipbot/',\r
 'AlphaBot',\r
+'anthropic\-ai/',\r
 'Apache\-HttpClient/',\r
 'Apexoo[\x20]Spider',\r
 'Applebot/',\r
+'Applebot\-Extended/',\r
 'archive\.org_bot',\r
 'Babya[\x20]Discoverer',\r
 'Barkrowler',\r
 'bitlybot',\r
 'bl\.uk_lddc_bot/',\r
 'BLEXBot/',\r
-'bnf.fr_bot',\r
+'bnf\.fr_bot',\r
 'boitho\.com\-dc/',\r
 'BoogleBot',\r
 'BusinessBot:',\r
 'BW/',\r
-'Bytespider',\r
+'Bytespider/',\r
 'CatchBot/',\r
 'CB/Nutch',\r
 'CCBot/',\r
+'ChatGPT\-User/',\r
 'CheckMarkNetwork/',\r
+'ClaudeBot/',\r
+'claude\-web/',\r
 'Cliqzbot/',\r
 'CMS[\x20]Crawler',\r
+'cohere\-ai/',\r
 'Companybook\-Crawler',\r
 'ConveraCrawler/',\r
-'Contacts-Crawler',\r
+'Contacts\-Crawler',\r
 'contxbot',\r
 'cosmos/',\r
 'crawl/Nutch',\r
 'daum',\r
 'Deepnet[\x20]Explorer',\r
 'DeuSu/',\r
+'Diffbot/',\r
 'Digincore',\r
 'Discordbot/',\r
 'Dispatch/',\r
 'DomainSONOCrawler/',\r
 'DomainStatsBot/',\r
 'DotBot/',\r
-'DuckDuckBot-Https',\r
+'DuckAssistBot/',\r
+'DuckDuckBot\-Https',\r
 'DuckDuckBot',\r
 'DuckDuckGo\-Favicons\-Bot/',\r
 'ELinks/',\r
 'MegaIndex\.ru/',\r
 'merzscope',\r
 'Meta_Bot',\r
+'meta\-externalads/',\r
+'meta\-externalagent/',\r
+'meta\-externalfetcher/',\r
+'meta\-webindexer/',\r
 'mfibot/',\r
 'microsoft.*discovery',\r
 'missigua_locator',\r
+'MistralAI\-User/',\r
 'MixrankBot',\r
 'MJ12bot/',\r
-'MojeekBot',\r
+'MojeekBot/',\r
 'Mojolicious',\r
 'MXT/Nutch',\r
 'My[\x20]Nutch[\x20]Spider/',\r
 'NerdyBot',\r
 'netEstate[\x20]NE[\x20]Crawler',\r
 'NetResearchServer/',\r
-'Nimbostratus-Bot',\r
+'Nimbostratus\-Bot',\r
 'nominet',\r
 'NRLCorpusBuilder/Nutch',\r
 'nutch\-1\.4/',\r
 'nutch\-1\.8/',\r
 'NutchCVS/',\r
 'o\.uk[\x20]robot',\r
+'OAI\-SearchBot/',\r
 'ocrawler;',\r
 'ODP[\x20]link[\x20]checker',\r
 'Offline[\x20]Explorer/',\r
+'omgili/',\r
 'OmniExplorer_Bot/',\r
 'OrangeBot/',\r
 'Orliac',\r
 'pdffillerbot/',\r
 'peopleman',\r
 'PetalBot',\r
+'PerplexityBot/',\r
+'Perplexity\-User/',\r
 'PhantomJS',\r
 'PHP/5\.2\.8',\r
 'Pinterestbot',\r
 'python_wk_crawler',\r
 'Python\-urllib/',\r
 'QCrawl',\r
-'Quick-Crawler',\r
+'Quick\-Crawler',\r
 'ResearchBot',\r
 'roboto',\r
 'rogerbot/',\r
 'Seekmo',\r
 'semanticbot',\r
 'SemrushBot/',\r
-'SemrushBot-SI',\r
+'SemrushBot\-SI',\r
 'seo\-audit\-check\-bot/',\r
 'Seobility',\r
 'SEOkicks\-Robot',\r
 'SpuhexBot',\r
 'spyonweb',\r
 'ssearch_bot',\r
+'Storebot\-Google/',\r
 'Streamline3Bot',\r
 'SurdotlyBot/',\r
 'SurveyBot/',\r
 'Test[\x20]Spider',\r
 'TestCrawler',\r
 'The[\x20]Knowledge[\x20]AI',\r
+'Timpibot/',\r
 'TkBot',\r
 'tracemyfile',\r
 'trendiction',\r
 'TurnitinBot',\r
 'TweetmemeBot/',\r
 'UCY/Nutch',\r
-'uni-leipzig\.de',\r
+'uni\-leipzig\.de',\r
 'Uptimebot/',\r
 'UptimeRobot/',\r
 'URL[\x20]Checker',\r
 'yak',\r
 'YisouSpider',\r
 'yoozBot',\r
+'YouBot',\r
 'Your\-Website\-Sucks',\r
 'zoominfobot',\r
 'zspider/',\r
 'extreme[_+\s]picture[_+\s]finder',\r
 'ezoom',\r
 'ezresult',\r
-'facebook',\r
+'FacebookBot/',\r
 'facebot',\r
 'fast\-search\-engine',\r
 'matrix_s\.p\.a\._\-_fast_enterprise_crawler',\r
 'Googlebot\-Image/','Googlebot-Image',\r
 'Googlebot\-Mobile/','Googlebot-Mobile',\r
 'Google[\x20]Page[\x20]Speed','Google Page Speed',\r
+'Google\-Extended/', 'Google-Extended (AI indexer, Gemini)',\r
+'Google\-InspectionTool/', 'Google-InspectionTool',\r
 'google\-sitemaps','google-sitemaps',\r
 'Googlebot\-News','Googlebot-News',\r
 'Googlebot\-Video/','Googlebot-Video',\r
+'GoogleOther', 'GoogleOther / GoogleOther-Image / GoogleOther-Video',\r
+'Google\-CloudVertexBot','Google-CloudVertexBot',\r
 'AdsBot\-Google[\x20]\(','AdsBot-Google',\r
 'AdsBot\-Google\-Mobile\-Apps','AdsBot-Google-Mobile-Apps',\r
 'Adsbot','Adsbot',\r
 'Yahoo![\x20]Slurp','Yahoo! Slurp',\r
 'Baiduspider/','Baiduspider',\r
 'Baiduspider\-image','Baiduspider-image',\r
-'Baiduspider-','Baiduspider ( catchall )',\r
+'Baiduspider\-','Baiduspider ( catchall )',\r
 'YandexBot/','YandexBot',\r
 'YandexImages/','YandexImages',\r
 'YandexImageResizer','YandexImageResizer',\r
 'Accoona\-AI\-Agent/','Accoona-AI-Agent',\r
 'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot',\r
 'adscanner','adscanner',\r
+'AI2Bot','AI2Bot (Allen Institute)',\r
 'aiHitBot/','aiHitBot',\r
 'aipbot/','aipbot',\r
 'AlphaBot','AlphaBot',\r
+'anthropic\-ai/','anthropic-ai',\r
 'Apache\-HttpClient/','Apache-HttpClient',\r
 'Apexoo[\x20]Spider','Apexoo Spider',\r
 'Applebot/','Applebot',\r
+'Applebot\-Extended/','Applebot-Extended',\r
 'arcemedia','AdsBot-ArceMedia',\r
 'archive\.org_bot','archive.org_bot',\r
 'Babya[\x20]Discoverer','Babya Discoverer',\r
 'bitlybot','bit.ly',\r
 'bl\.uk_lddc_bot/','bl.uk_lddc_bot',\r
 'BLEXBot/','BLEXBot',\r
-'bnf.fr_bot','bnf.fr_bot',\r
+'bnf\.fr_bot','bnf.fr_bot',\r
 'boitho\.com\-dc/','boitho.com-dc',\r
 'BoogleBot','BoogleBot',\r
 'BusinessBot:','BusinessBot:',\r
 'BW/','BW',\r
-'Bytespider','Bytespider',\r
+'Bytespider/','Bytespider (Bytedance)',\r
 'CatchBot/','CatchBot',\r
 'CB/Nutch','CB/Nutch',\r
-'CCBot/','CCBot',\r
+'CCBot/','CCBot (Common Crawl, open/free AI dataset)',\r
+'ChatGPT\-User/','ChatGPT-User (OpenAI)',\r
 'CheckMarkNetwork/','CheckMarkNetwork',\r
+'ClaudeBot/','ClaudeBot (Anthropic)',\r
+'claude\-web/','claude-web (Anthropic)',\r
 'Cliqzbot/','Cliqzbot',\r
 'CMS[\x20]Crawler','CMS Crawler',\r
+'cohere\-ai/','cohere-ai',\r
 'Companybook\-Crawler','Companybook-Crawler',\r
 'ConveraCrawler/','ConveraCrawler',\r
-'Contacts-Crawler','Contacts-Crawler',\r
+'Contacts\-Crawler','Contacts-Crawler',\r
 'contxbot','contxbot',\r
 'cosmos/','cosmos',\r
 'CRMNLCrawlAgent','CRMNLCrawlAgent',\r
 'daum','daum',\r
 'Deepnet[\x20]Explorer','Deepnet Explorer',\r
 'DeuSu/','DeuSu',\r
+'Diffbot/', 'Diffbot',\r
 'Digincore','Digincore',\r
 'Discordbot/','Discordbot',\r
 'Dispatch/','Dispatch',\r
 'DomainSONOCrawler/','DomainSONOCrawler',\r
 'DomainStatsBot/','DomainStatsBot',\r
 'DotBot/','DotBot',\r
-'DuckDuckBot-Https','DuckDuckBot-Https',\r
+'DuckAssistBot/', 'DuckAssist (DuckDuckGo AI)',\r
+'DuckDuckBot\-Https','DuckDuckBot-Https',\r
 'DuckDuckBot','DuckDuckBot',\r
 'DuckDuckGo\-Favicons\-Bot/','DuckDuckGo-Favicons-Bot',\r
 'ELinks/','ELinks',\r
 'Exabot/','Exabot',\r
 'ExtLinksBot','ExtLinksBot',\r
 'ExperianCrawlUK','ExperianCrawlUK',\r
-'facebookexternalhit/','facebookexternalhit',\r
+'facebookexternalhit/','facebookexternalhit (Meta/Facebook/Instagram shared link)',\r
 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise crawleradmin.t-info@telekom.de',\r
 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise T-Info_BI_cluster crawleradmin.t-info@telekom.de',\r
 'FAST\-WebCrawler/','FAST-WebCrawler',\r
 'Jooblebot','Jooblebot',\r
 'KomodiaBot/','KomodiaBot',\r
 'Konqueror/','Konqueror',\r
+'laserlikebot','laserlikebot',\r
 'Lightspeed','Lightspeed',\r
 'linkapediabot','linkapediabot',\r
 'metager\-linkchecker','metager-linkchecker',\r
 'MegaIndex\.ru/','MegaIndex.ru',\r
 'merzscope','merzscope',\r
 'Meta_Bot','Meta_Bot',\r
+'meta\-externalads/','meta-externalads (Meta/Facebook/Instagram) ads)',\r
+'meta\-externalagent/','meta-externalagent (Meta/Facebook/Instagram indexer + AI)',\r
+'meta\-externalfetcher/','meta-externalfetcher (Meta/Facebook/Instagram user-initiated fetch)',\r
+'meta\-webindexer/','meta-webindexer (Meta/Facebook/Instagram indexer + AI)',\r
 'mfibot/','mfibot',\r
 'microsoft.*discovery','Microsoft Office Protocol Discovery',\r
 'missigua_locator','missigua_locator',\r
+'MistralAI\-User/','MistralAI-User',\r
 'MixrankBot','MixrankBot',\r
 'MJ12bot/','MJ12bot',\r
-'mojeek','mojeek',\r
 'MojeekBot/','MojeekBot',\r
 'Mojolicious','Mojolicious',\r
 'MXT/Nutch','MXT/Nutch',\r
 'NerdyBot','NerdyBot',\r
 'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler',\r
 'NetResearchServer/','NetResearchServer',\r
-'Nimbostratus-Bot','Nimbostratus-Bot',\r
+'Nimbostratus\-Bot','Nimbostratus-Bot',\r
 'nominet','nominet',\r
 'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch',\r
 'nutch\-1\.4/','nutch-1.4',\r
 'NutchCVS/','NutchCVS',\r
 'o\.uk[\x20]robot','o uk.robot',\r
 'ocrawler;','ocrawler;',\r
+'OAI\-SearchBot/','OAI-SearchBot (OpenAI)',\r
 'ODP[\x20]link[\x20]checker','ODP link checker',\r
 'Offline[\x20]Explorer/','Offline Explorer',\r
+'omgili/','omgili (webz.io)',\r
 'OmniExplorer_Bot/','OmniExplorer_Bot',\r
 'OrangeBot/','OrangeBot',\r
 'Orliac','Orliac',\r
 'pdffillerbot/','pdffillerbot',\r
 'peopleman','peopleman',\r
 'PetalBot','PetalBot',\r
+'PerplexityBot/','PerplexityBot',\r
+'Perplexity\-User/', 'Perplexity-User',\r
 'PhantomJS','PhantomJS',\r
 'PHP/5\.2\.8','PHP/5.2.8',\r
 'Pinterestbot','Pinterestbot',\r
 'python_wk_crawler','python_wk_crawler',\r
 'Python\-urllib/','Python-urllib',\r
 'QCrawl','QCrawl',\r
-'Quick-Crawler','Quick-Crawler',\r
+'Quick\-Crawler','Quick-Crawler',\r
 'ResearchBot','ResearchBot',\r
 'roboto','roboto',\r
 'rogerbot/','rogerbot',\r
 'Seekmo','Seekmo',\r
 'semanticbot','semanticbot',\r
 'SemrushBot/','SemrushBot',\r
-'SemrushBot-SI','SemrushBot-SI',\r
+'SemrushBot\-SI','SemrushBot-SI',\r
 'seo\-audit\-check\-bot/','seo-audit-check-bot',\r
 'Seobility','Seobility',\r
 'SEOkicks\-Robot','SEOkicks-Robot',\r
 'SpuhexBot','SpuhexBot',\r
 'spyonweb','spyonweb',\r
 'ssearch_bot','ssearch_bot',\r
+'Storebot\-Google/','Storebot-Google',\r
 'Streamline3Bot','Streamline3Bot',\r
 'SurdotlyBot/','SurdotlyBot',\r
 'SurveyBot/','SurveyBot',\r
 'Test[\x20]Spider','Test Spider',\r
 'TestCrawler','TestCrawler',\r
 'The[\x20]Knowledge[\x20]AI', 'The Knowledge AI',\r
+'Timpibot/','Timpibot (timpi.io)',\r
 'TkBot','TkBot',\r
 'tracemyfile','tracemyfile',\r
 'trendiction','trendiction',\r
 'TurnitinBot','TurnitinBot',\r
 'TweetmemeBot/','TweetmemeBot',\r
 'UCY/Nutch','UCY/Nutch',\r
-'uni-leipzig\.de','uni-leipzig.de',\r
+'uni\-leipzig\.de','uni-leipzig.de',\r
 'Uptimebot/','Uptimebot',\r
 'UptimeRobot/','UptimeRobot',\r
 'URL[\x20]Checker','URL Checker',\r
 'yak','yak-linkfluence',\r
 'YisouSpider','YisouSpider',\r
 'yoozBot','yoozBot',\r
+'YouBot','YouBot (You.com)',\r
 'Your\-Website\-Sucks','Your-Website-Sucks',\r
 'zoominfobot','zoominfobot',\r
 'zspider/','zspider',\r
 'extreme[_+\s]picture[_+\s]finder','extreme picture finder',\r
 'ezoom','ezoom',\r
 'ezresult','ezresult',\r
-'facebook','facebook',\r
+'FacebookBot/','FacebookBot',\r
 'facebot','facebot',\r
 'fast\-search\-engine','fast-search-engine',\r
 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','matrix_s.p.a._-_fast_enterprise_crawler',\r