#\r
#-------------------------------------------------------\r
\r
+# 2025-09-26 Daniel-Percy Wimpff <daniel@wimpff.de>\r
+# Added AI2Bot (Allen Institute, https://allenai.org/crawler)\r
+# Added Applebot-Extended (Apple AI indexer)\r
+# Added anthropic-ai (Anthropic)\r
+# Added ChatGPT-User (OpenAI on demand URL fetcher, https://openai.com/bot)\r
+# Added ClaudeBot (Anthropic)\r
+# Added claude-web (Anthropic)\r
+# Added cohere-ai (Cohere, according to https://momenticmarketing.com/blog/ai-search-crawlers-bots)\r
+# Added Diffbot\r
+# Added DuckAssistBot (DuckDuckGo AI)\r
+# Added facebookcatalog (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+# Added FacebookBot (Meta, according to https://momenticmarketing.com/blog/ai-search-crawlers-bots)\r
+# Added Google-CloudVertexBot (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+# Added Google-Extended (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+# Added Google-InspectionTool (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+# Added GoogleOther (also fetches GoogleOther-Image, GoogleOther-Video, https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+# Added meta-externalads (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+# Added meta-externalagent (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+# Added meta-externalfetcher (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+# Added meta-webindexer (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)\r
+# Added MistralAI-User (mistral.ai)\r
+# Added OAI-SearchBot (OpenAI indexer, https://openai.com/searchbot)\r
+# Added omgili (Webz.io, https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/)\r
+# Added PerplexityBot (https://perplexity.ai/perplexitybot)\r
+# Added Perplexity-User (https://perplexity.ai/perplexitybot)\r
+# Added Storebot-Google (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers)\r
+# Added Timpibot (http://www.timpi.io)\r
+# Added YouBot (You.com)\r
+# \r
+# Edited description for facebookexternalhit in %RobotsHashIDLib\r
+# Edited entry and description for Bytespider\r
+# Edited description for CCBot\r
+# \r
+# Removed facebook (not documented by Meta - substituted by adding entries for FacebookBot and meta-... crawlers)\r
+#\r
+# Fixed: Missing entry for laserlikebot in %RobotsHashIDLib\r
+# Fixed: Entries for MojeekBot differ in @RobotsSearchIDOrder_list1 and %RobotsHashIDLib. Using variant with trailing slash now.\r
+# Fixed: Unescaped dashes (-) in entries Mediapartners-Google, Baiduspider-, Contacts-Crawler, DuckDuckBot-Https, Nimbostratus-Bot, Quick-Crawler, SemrushBot-SI, uni-leipzig.de\r
+# Fixed: Unescaped dot (.) in entry bnf.fr_bot\r
+# Fixed: Removed deprecated entry mojeek from %RobotsHashIDLib\r
+\r
# 2024-07-17 Dinko Sotirov\r
# Added Amazonbot (https://developer.amazon.com/support/amazonbot)\r
# Added GPTBot (https://openai.com/gptbot)\r
'Googlebot\-Image/',\r
'Googlebot\-Mobile/',\r
'Google[\x20]Page[\x20]Speed',\r
+'Google\-Extended/',\r
+'Google\-InspectionTool/',\r
'google\-sitemaps',\r
'Googlebot\-News',\r
'Googlebot\-Video/',\r
+'GoogleOther',\r
+'Google\-CloudVertexBot',\r
'AdsBot\-Google[\x20]\(',\r
'AdsBot\-Google\-Mobile\-Apps',\r
'Adsbot',\r
-'Mediapartners-Google',\r
+'Mediapartners\-Google',\r
'Feedfetcher\-Google',\r
'Google\-Adwords\-Instant',\r
'Firefox/1\.5',\r
'Yahoo![\x20]Slurp',\r
'Baiduspider/',\r
'Baiduspider\-image',\r
-'Baiduspider-',\r
+'Baiduspider\-',\r
'YandexBot/',\r
'YandexImages/',\r
'YandexImageResizer',\r
'arcemedia',\r
'AdnormCrawlerCatchBot/',\r
'adscanner',\r
+'AI2Bot',\r
'aiHitBot/',\r
'aipbot/',\r
'AlphaBot',\r
+'anthropic\-ai/',\r
'Apache\-HttpClient/',\r
'Apexoo[\x20]Spider',\r
'Applebot/',\r
+'Applebot\-Extended/',\r
'archive\.org_bot',\r
'Babya[\x20]Discoverer',\r
'Barkrowler',\r
'bitlybot',\r
'bl\.uk_lddc_bot/',\r
'BLEXBot/',\r
-'bnf.fr_bot',\r
+'bnf\.fr_bot',\r
'boitho\.com\-dc/',\r
'BoogleBot',\r
'BusinessBot:',\r
'BW/',\r
-'Bytespider',\r
+'Bytespider/',\r
'CatchBot/',\r
'CB/Nutch',\r
'CCBot/',\r
+'ChatGPT\-User/',\r
'CheckMarkNetwork/',\r
+'ClaudeBot/',\r
+'claude\-web/',\r
'Cliqzbot/',\r
'CMS[\x20]Crawler',\r
+'cohere\-ai/',\r
'Companybook\-Crawler',\r
'ConveraCrawler/',\r
-'Contacts-Crawler',\r
+'Contacts\-Crawler',\r
'contxbot',\r
'cosmos/',\r
'crawl/Nutch',\r
'daum',\r
'Deepnet[\x20]Explorer',\r
'DeuSu/',\r
+'Diffbot/',\r
'Digincore',\r
'Discordbot/',\r
'Dispatch/',\r
'DomainSONOCrawler/',\r
'DomainStatsBot/',\r
'DotBot/',\r
-'DuckDuckBot-Https',\r
+'DuckAssistBot/',\r
+'DuckDuckBot\-Https',\r
'DuckDuckBot',\r
'DuckDuckGo\-Favicons\-Bot/',\r
'ELinks/',\r
'MegaIndex\.ru/',\r
'merzscope',\r
'Meta_Bot',\r
+'meta\-externalads/',\r
+'meta\-externalagent/',\r
+'meta\-externalfetcher/',\r
+'meta\-webindexer/',\r
'mfibot/',\r
'microsoft.*discovery',\r
'missigua_locator',\r
+'MistralAI\-User/',\r
'MixrankBot',\r
'MJ12bot/',\r
-'MojeekBot',\r
+'MojeekBot/',\r
'Mojolicious',\r
'MXT/Nutch',\r
'My[\x20]Nutch[\x20]Spider/',\r
'NerdyBot',\r
'netEstate[\x20]NE[\x20]Crawler',\r
'NetResearchServer/',\r
-'Nimbostratus-Bot',\r
+'Nimbostratus\-Bot',\r
'nominet',\r
'NRLCorpusBuilder/Nutch',\r
'nutch\-1\.4/',\r
'nutch\-1\.8/',\r
'NutchCVS/',\r
'o\.uk[\x20]robot',\r
+'OAI\-SearchBot/',\r
'ocrawler;',\r
'ODP[\x20]link[\x20]checker',\r
'Offline[\x20]Explorer/',\r
+'omgili/',\r
'OmniExplorer_Bot/',\r
'OrangeBot/',\r
'Orliac',\r
'pdffillerbot/',\r
'peopleman',\r
'PetalBot',\r
+'PerplexityBot/',\r
+'Perplexity\-User/',\r
'PhantomJS',\r
'PHP/5\.2\.8',\r
'Pinterestbot',\r
'python_wk_crawler',\r
'Python\-urllib/',\r
'QCrawl',\r
-'Quick-Crawler',\r
+'Quick\-Crawler',\r
'ResearchBot',\r
'roboto',\r
'rogerbot/',\r
'Seekmo',\r
'semanticbot',\r
'SemrushBot/',\r
-'SemrushBot-SI',\r
+'SemrushBot\-SI',\r
'seo\-audit\-check\-bot/',\r
'Seobility',\r
'SEOkicks\-Robot',\r
'SpuhexBot',\r
'spyonweb',\r
'ssearch_bot',\r
+'Storebot\-Google/',\r
'Streamline3Bot',\r
'SurdotlyBot/',\r
'SurveyBot/',\r
'Test[\x20]Spider',\r
'TestCrawler',\r
'The[\x20]Knowledge[\x20]AI',\r
+'Timpibot/',\r
'TkBot',\r
'tracemyfile',\r
'trendiction',\r
'TurnitinBot',\r
'TweetmemeBot/',\r
'UCY/Nutch',\r
-'uni-leipzig\.de',\r
+'uni\-leipzig\.de',\r
'Uptimebot/',\r
'UptimeRobot/',\r
'URL[\x20]Checker',\r
'yak',\r
'YisouSpider',\r
'yoozBot',\r
+'YouBot',\r
'Your\-Website\-Sucks',\r
'zoominfobot',\r
'zspider/',\r
'extreme[_+\s]picture[_+\s]finder',\r
'ezoom',\r
'ezresult',\r
-'facebook',\r
+'FacebookBot/',\r
'facebot',\r
'fast\-search\-engine',\r
'matrix_s\.p\.a\._\-_fast_enterprise_crawler',\r
'Googlebot\-Image/','Googlebot-Image',\r
'Googlebot\-Mobile/','Googlebot-Mobile',\r
'Google[\x20]Page[\x20]Speed','Google Page Speed',\r
+'Google\-Extended/', 'Google-Extended (AI indexer, Gemini)',\r
+'Google\-InspectionTool/', 'Google-InspectionTool',\r
'google\-sitemaps','google-sitemaps',\r
'Googlebot\-News','Googlebot-News',\r
'Googlebot\-Video/','Googlebot-Video',\r
+'GoogleOther', 'GoogleOther / GoogleOther-Image / GoogleOther-Video',\r
+'Google\-CloudVertexBot','Google-CloudVertexBot',\r
'AdsBot\-Google[\x20]\(','AdsBot-Google',\r
'AdsBot\-Google\-Mobile\-Apps','AdsBot-Google-Mobile-Apps',\r
'Adsbot','Adsbot',\r
'Yahoo![\x20]Slurp','Yahoo! Slurp',\r
'Baiduspider/','Baiduspider',\r
'Baiduspider\-image','Baiduspider-image',\r
-'Baiduspider-','Baiduspider ( catchall )',\r
+'Baiduspider\-','Baiduspider ( catchall )',\r
'YandexBot/','YandexBot',\r
'YandexImages/','YandexImages',\r
'YandexImageResizer','YandexImageResizer',\r
'Accoona\-AI\-Agent/','Accoona-AI-Agent',\r
'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot',\r
'adscanner','adscanner',\r
+'AI2Bot','AI2Bot (Allen Institute)',\r
'aiHitBot/','aiHitBot',\r
'aipbot/','aipbot',\r
'AlphaBot','AlphaBot',\r
+'anthropic\-ai/','anthropic-ai',\r
'Apache\-HttpClient/','Apache-HttpClient',\r
'Apexoo[\x20]Spider','Apexoo Spider',\r
'Applebot/','Applebot',\r
+'Applebot\-Extended/','Applebot-Extended',\r
'arcemedia','AdsBot-ArceMedia',\r
'archive\.org_bot','archive.org_bot',\r
'Babya[\x20]Discoverer','Babya Discoverer',\r
'bitlybot','bit.ly',\r
'bl\.uk_lddc_bot/','bl.uk_lddc_bot',\r
'BLEXBot/','BLEXBot',\r
-'bnf.fr_bot','bnf.fr_bot',\r
+'bnf\.fr_bot','bnf.fr_bot',\r
'boitho\.com\-dc/','boitho.com-dc',\r
'BoogleBot','BoogleBot',\r
'BusinessBot:','BusinessBot:',\r
'BW/','BW',\r
-'Bytespider','Bytespider',\r
+'Bytespider/','Bytespider (Bytedance)',\r
'CatchBot/','CatchBot',\r
'CB/Nutch','CB/Nutch',\r
-'CCBot/','CCBot',\r
+'CCBot/','CCBot (Common Crawl, open/free AI dataset)',\r
+'ChatGPT\-User/','ChatGPT-User (OpenAI)',\r
'CheckMarkNetwork/','CheckMarkNetwork',\r
+'ClaudeBot/','ClaudeBot (Anthropic)',\r
+'claude\-web/','claude-web (Anthropic)',\r
'Cliqzbot/','Cliqzbot',\r
'CMS[\x20]Crawler','CMS Crawler',\r
+'cohere\-ai/','cohere-ai',\r
'Companybook\-Crawler','Companybook-Crawler',\r
'ConveraCrawler/','ConveraCrawler',\r
-'Contacts-Crawler','Contacts-Crawler',\r
+'Contacts\-Crawler','Contacts-Crawler',\r
'contxbot','contxbot',\r
'cosmos/','cosmos',\r
'CRMNLCrawlAgent','CRMNLCrawlAgent',\r
'daum','daum',\r
'Deepnet[\x20]Explorer','Deepnet Explorer',\r
'DeuSu/','DeuSu',\r
+'Diffbot/', 'Diffbot',\r
'Digincore','Digincore',\r
'Discordbot/','Discordbot',\r
'Dispatch/','Dispatch',\r
'DomainSONOCrawler/','DomainSONOCrawler',\r
'DomainStatsBot/','DomainStatsBot',\r
'DotBot/','DotBot',\r
-'DuckDuckBot-Https','DuckDuckBot-Https',\r
+'DuckAssistBot/', 'DuckAssist (DuckDuckGo AI)',\r
+'DuckDuckBot\-Https','DuckDuckBot-Https',\r
'DuckDuckBot','DuckDuckBot',\r
'DuckDuckGo\-Favicons\-Bot/','DuckDuckGo-Favicons-Bot',\r
'ELinks/','ELinks',\r
'Exabot/','Exabot',\r
'ExtLinksBot','ExtLinksBot',\r
'ExperianCrawlUK','ExperianCrawlUK',\r
-'facebookexternalhit/','facebookexternalhit',\r
+'facebookexternalhit/','facebookexternalhit (Meta/Facebook/Instagram shared link)',\r
'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise crawleradmin.t-info@telekom.de',\r
'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise T-Info_BI_cluster crawleradmin.t-info@telekom.de',\r
'FAST\-WebCrawler/','FAST-WebCrawler',\r
'Jooblebot','Jooblebot',\r
'KomodiaBot/','KomodiaBot',\r
'Konqueror/','Konqueror',\r
+'laserlikebot','laserlikebot',\r
'Lightspeed','Lightspeed',\r
'linkapediabot','linkapediabot',\r
'metager\-linkchecker','metager-linkchecker',\r
'MegaIndex\.ru/','MegaIndex.ru',\r
'merzscope','merzscope',\r
'Meta_Bot','Meta_Bot',\r
+'meta\-externalads/','meta-externalads (Meta/Facebook/Instagram ads)',\r
+'meta\-externalagent/','meta-externalagent (Meta/Facebook/Instagram indexer + AI)',\r
+'meta\-externalfetcher/','meta-externalfetcher (Meta/Facebook/Instagram user-initiated fetch)',\r
+'meta\-webindexer/','meta-webindexer (Meta/Facebook/Instagram indexer + AI)',\r
'mfibot/','mfibot',\r
'microsoft.*discovery','Microsoft Office Protocol Discovery',\r
'missigua_locator','missigua_locator',\r
+'MistralAI\-User/','MistralAI-User',\r
'MixrankBot','MixrankBot',\r
'MJ12bot/','MJ12bot',\r
-'mojeek','mojeek',\r
'MojeekBot/','MojeekBot',\r
'Mojolicious','Mojolicious',\r
'MXT/Nutch','MXT/Nutch',\r
'NerdyBot','NerdyBot',\r
'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler',\r
'NetResearchServer/','NetResearchServer',\r
-'Nimbostratus-Bot','Nimbostratus-Bot',\r
+'Nimbostratus\-Bot','Nimbostratus-Bot',\r
'nominet','nominet',\r
'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch',\r
'nutch\-1\.4/','nutch-1.4',\r
'NutchCVS/','NutchCVS',\r
'o\.uk[\x20]robot','o uk.robot',\r
'ocrawler;','ocrawler;',\r
+'OAI\-SearchBot/','OAI-SearchBot (OpenAI)',\r
'ODP[\x20]link[\x20]checker','ODP link checker',\r
'Offline[\x20]Explorer/','Offline Explorer',\r
+'omgili/','omgili (webz.io)',\r
'OmniExplorer_Bot/','OmniExplorer_Bot',\r
'OrangeBot/','OrangeBot',\r
'Orliac','Orliac',\r
'pdffillerbot/','pdffillerbot',\r
'peopleman','peopleman',\r
'PetalBot','PetalBot',\r
+'PerplexityBot/','PerplexityBot',\r
+'Perplexity\-User/', 'Perplexity-User',\r
'PhantomJS','PhantomJS',\r
'PHP/5\.2\.8','PHP/5.2.8',\r
'Pinterestbot','Pinterestbot',\r
'python_wk_crawler','python_wk_crawler',\r
'Python\-urllib/','Python-urllib',\r
'QCrawl','QCrawl',\r
-'Quick-Crawler','Quick-Crawler',\r
+'Quick\-Crawler','Quick-Crawler',\r
'ResearchBot','ResearchBot',\r
'roboto','roboto',\r
'rogerbot/','rogerbot',\r
'Seekmo','Seekmo',\r
'semanticbot','semanticbot',\r
'SemrushBot/','SemrushBot',\r
-'SemrushBot-SI','SemrushBot-SI',\r
+'SemrushBot\-SI','SemrushBot-SI',\r
'seo\-audit\-check\-bot/','seo-audit-check-bot',\r
'Seobility','Seobility',\r
'SEOkicks\-Robot','SEOkicks-Robot',\r
'SpuhexBot','SpuhexBot',\r
'spyonweb','spyonweb',\r
'ssearch_bot','ssearch_bot',\r
+'Storebot\-Google/','Storebot-Google',\r
'Streamline3Bot','Streamline3Bot',\r
'SurdotlyBot/','SurdotlyBot',\r
'SurveyBot/','SurveyBot',\r
'Test[\x20]Spider','Test Spider',\r
'TestCrawler','TestCrawler',\r
'The[\x20]Knowledge[\x20]AI', 'The Knowledge AI',\r
+'Timpibot/','Timpibot (timpi.io)',\r
'TkBot','TkBot',\r
'tracemyfile','tracemyfile',\r
'trendiction','trendiction',\r
'TurnitinBot','TurnitinBot',\r
'TweetmemeBot/','TweetmemeBot',\r
'UCY/Nutch','UCY/Nutch',\r
-'uni-leipzig\.de','uni-leipzig.de',\r
+'uni\-leipzig\.de','uni-leipzig.de',\r
'Uptimebot/','Uptimebot',\r
'UptimeRobot/','UptimeRobot',\r
'URL[\x20]Checker','URL Checker',\r
'yak','yak-linkfluence',\r
'YisouSpider','YisouSpider',\r
'yoozBot','yoozBot',\r
+'YouBot','YouBot (You.com)',\r
'Your\-Website\-Sucks','Your-Website-Sucks',\r
'zoominfobot','zoominfobot',\r
'zspider/','zspider',\r
'extreme[_+\s]picture[_+\s]finder','extreme picture finder',\r
'ezoom','ezoom',\r
'ezresult','ezresult',\r
-'facebook','facebook',\r
+'FacebookBot/','FacebookBot',\r
'facebot','facebot',\r
'fast\-search\-engine','fast-search-engine',\r
'matrix_s\.p\.a\._\-_fast_enterprise_crawler','matrix_s.p.a._-_fast_enterprise_crawler',\r