From: dpw0001 Date: Fri, 26 Sep 2025 11:51:13 +0000 (+0200) Subject: Added new (AI) bots to robots.pm. Also fixed some inconsistencies in the bot lists. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=97892237d664aed21db468d5f498ba000509208d;p=thirdparty%2FAWStats.git Added new (AI) bots to robots.pm. Also fixed some inconsistencies in the bot lists. --- diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index a7b22451..f6903aca 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -43,6 +43,47 @@ # #------------------------------------------------------- +# 2025-09-26 Daniel-Percy Wimpff +# Added AI2Bot (Allen Institute, https://allenai.org/crawler) +# Added Applebot-Extended (Apple AI indexer) +# Added anthropic-ai (Anthropic) +# Added ChatGPT-User (OpenAI on demand URL fetcher, https://openai.com/bot) +# Added ClaudeBot (Anthropic) +# Added claude-web (Anthropic) +# Added cohere-ai (Cohere, according to https://momenticmarketing.com/blog/ai-search-crawlers-bots) +# Added Diffbot +# Added DuckAssistBot (DuckDuckGo AI) +# Added facebookcatalog (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) +# Added FacebookBot (Meta, according to https://momenticmarketing.com/blog/ai-search-crawlers-bots) +# Added Google-CloudVertexBot (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers) +# Added Google-Extended (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers) +# Added Google-InspectionTool (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers) +# Added GoogleOther (also fetches GoogleOther-Image, GoogleOther-Video, https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers) +# Added meta-externalads (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) +# Added meta-externalagent (Meta, 
https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) +# Added meta-externalfetcher (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) +# Added meta-webindexer (Meta, https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) +# Added MistralAI-User (mistral.ai) +# Added OAI-SearchBot (OpenAI indexer, https://openai.com/searchbot) +# Added omgili (Webz.io, https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) +# Added PerplexityBot (https://perplexity.ai/perplexitybot) +# Added Perplexity-User (https://perplexity.ai/perplexitybot) +# Added Storebot-Google (https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers) +# Added Timpibot (http://www.timpi.io) +# Added YouBot (You.com) +# +# Edited description for facebookexternalhit in %RobotsHashIDLib +# Edited entry and description for Bytespider +# Edited description for CCBot +# +# Removed facebook (not documented by Meta - substituted by adding entries for FacebookBot and meta-... crawlers) +# +# Fixed: Missing entry for laserlikebot in %RobotsHashIDLib +# Fixed: Entries for MojeekBot differ in @RobotsSearchIDOrder_list1 and %RobotsHashIDLib. Using variant with trailing slash now. +# Fixed: Unescaped dashes (-) in entries Mediapartners-Google, Baiduspider-, Contacts-Crawler, DuckDuckBot-Https, Nimbostratus-Bot, Quick-Crawler, SemrushBot-SI, uni-leipzig.de +# Fixed: Unescaped dot (.) 
in entry bnf.fr_bot +# Fixed: Removed deprecated entry mojeek from %RobotsHashIDLib + # 2024-07-17 Dinko Sotirov # Added Amazonbot (https://developer.amazon.com/support/amazonbot) # Added GPTBot (https://openai.com/gptbot) @@ -442,13 +483,17 @@ 'Googlebot\-Image/', 'Googlebot\-Mobile/', 'Google[\x20]Page[\x20]Speed', +'Google\-Extended/', +'Google\-InspectionTool/', 'google\-sitemaps', 'Googlebot\-News', 'Googlebot\-Video/', +'GoogleOther', +'Google\-CloudVertexBot', 'AdsBot\-Google[\x20]\(', 'AdsBot\-Google\-Mobile\-Apps', 'Adsbot', -'Mediapartners-Google', +'Mediapartners\-Google', 'Feedfetcher\-Google', 'Google\-Adwords\-Instant', 'Firefox/1\.5', @@ -456,7 +501,7 @@ 'Yahoo![\x20]Slurp', 'Baiduspider/', 'Baiduspider\-image', -'Baiduspider-', +'Baiduspider\-', 'YandexBot/', 'YandexImages/', 'YandexImageResizer', @@ -481,12 +526,15 @@ 'arcemedia', 'AdnormCrawlerCatchBot/', 'adscanner', +'AI2Bot', 'aiHitBot/', 'aipbot/', 'AlphaBot', +'anthropic\-ai/', 'Apache\-HttpClient/', 'Apexoo[\x20]Spider', 'Applebot/', +'Applebot\-Extended/', 'archive\.org_bot', 'Babya[\x20]Discoverer', 'Barkrowler', @@ -500,21 +548,25 @@ 'bitlybot', 'bl\.uk_lddc_bot/', 'BLEXBot/', -'bnf.fr_bot', +'bnf\.fr_bot', 'boitho\.com\-dc/', 'BoogleBot', 'BusinessBot:', 'BW/', -'Bytespider', +'Bytespider/', 'CatchBot/', 'CB/Nutch', 'CCBot/', +'ChatGPT\-User/', 'CheckMarkNetwork/', +'ClaudeBot/', +'claude\-web/', 'Cliqzbot/', 'CMS[\x20]Crawler', +'cohere\-ai/', 'Companybook\-Crawler', 'ConveraCrawler/', -'Contacts-Crawler', +'Contacts\-Crawler', 'contxbot', 'cosmos/', 'crawl/Nutch', @@ -531,6 +583,7 @@ 'daum', 'Deepnet[\x20]Explorer', 'DeuSu/', +'Diffbot/', 'Digincore', 'Discordbot/', 'Dispatch/', @@ -542,7 +595,8 @@ 'DomainSONOCrawler/', 'DomainStatsBot/', 'DotBot/', -'DuckDuckBot-Https', +'DuckAssistBot/', +'DuckDuckBot\-Https', 'DuckDuckBot', 'DuckDuckGo\-Favicons\-Bot/', 'ELinks/', @@ -636,12 +690,17 @@ 'MegaIndex\.ru/', 'merzscope', 'Meta_Bot', +'meta\-externalads/', +'meta\-externalagent/', 
+'meta\-externalfetcher/', +'meta\-webindexer/', 'mfibot/', 'microsoft.*discovery', 'missigua_locator', +'MistralAI\-User/', 'MixrankBot', 'MJ12bot/', -'MojeekBot', +'MojeekBot/', 'Mojolicious', 'MXT/Nutch', 'My[\x20]Nutch[\x20]Spider/', @@ -651,16 +710,18 @@ 'NerdyBot', 'netEstate[\x20]NE[\x20]Crawler', 'NetResearchServer/', -'Nimbostratus-Bot', +'Nimbostratus\-Bot', 'nominet', 'NRLCorpusBuilder/Nutch', 'nutch\-1\.4/', 'nutch\-1\.8/', 'NutchCVS/', 'o\.uk[\x20]robot', +'OAI\-SearchBot/', 'ocrawler;', 'ODP[\x20]link[\x20]checker', 'Offline[\x20]Explorer/', +'omgili/', 'OmniExplorer_Bot/', 'OrangeBot/', 'Orliac', @@ -670,6 +731,8 @@ 'pdffillerbot/', 'peopleman', 'PetalBot', +'PerplexityBot/', +'Perplexity\-User/', 'PhantomJS', 'PHP/5\.2\.8', 'Pinterestbot', @@ -685,7 +748,7 @@ 'python_wk_crawler', 'Python\-urllib/', 'QCrawl', -'Quick-Crawler', +'Quick\-Crawler', 'ResearchBot', 'roboto', 'rogerbot/', @@ -705,7 +768,7 @@ 'Seekmo', 'semanticbot', 'SemrushBot/', -'SemrushBot-SI', +'SemrushBot\-SI', 'seo\-audit\-check\-bot/', 'Seobility', 'SEOkicks\-Robot', @@ -726,6 +789,7 @@ 'SpuhexBot', 'spyonweb', 'ssearch_bot', +'Storebot\-Google/', 'Streamline3Bot', 'SurdotlyBot/', 'SurveyBot/', @@ -737,6 +801,7 @@ 'Test[\x20]Spider', 'TestCrawler', 'The[\x20]Knowledge[\x20]AI', +'Timpibot/', 'TkBot', 'tracemyfile', 'trendiction', @@ -744,7 +809,7 @@ 'TurnitinBot', 'TweetmemeBot/', 'UCY/Nutch', -'uni-leipzig\.de', +'uni\-leipzig\.de', 'Uptimebot/', 'UptimeRobot/', 'URL[\x20]Checker', @@ -782,6 +847,7 @@ 'yak', 'YisouSpider', 'yoozBot', +'YouBot', 'Your\-Website\-Sucks', 'zoominfobot', 'zspider/', @@ -992,7 +1058,7 @@ 'extreme[_+\s]picture[_+\s]finder', 'ezoom', 'ezresult', -'facebook', +'FacebookBot/', 'facebot', 'fast\-search\-engine', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler', @@ -1625,9 +1691,13 @@ 'Googlebot\-Image/','Googlebot-Image', 'Googlebot\-Mobile/','Googlebot-Mobile', 'Google[\x20]Page[\x20]Speed','Google Page Speed', +'Google\-Extended/', 'Google-Extended (AI 
indexer, Gemini)', +'Google\-InspectionTool/', 'Google-InspectionTool', 'google\-sitemaps','google-sitemaps', 'Googlebot\-News','Googlebot-News', 'Googlebot\-Video/','Googlebot-Video', +'GoogleOther', 'GoogleOther / GoogleOther-Image / GoogleOther-Video', +'Google\-CloudVertexBot','Google-CloudVertexBot', 'AdsBot\-Google[\x20]\(','AdsBot-Google', 'AdsBot\-Google\-Mobile\-Apps','AdsBot-Google-Mobile-Apps', 'Adsbot','Adsbot', @@ -1639,7 +1709,7 @@ 'Yahoo![\x20]Slurp','Yahoo! Slurp', 'Baiduspider/','Baiduspider', 'Baiduspider\-image','Baiduspider-image', -'Baiduspider-','Baiduspider ( catchall )', +'Baiduspider\-','Baiduspider ( catchall )', 'YandexBot/','YandexBot', 'YandexImages/','YandexImages', 'YandexImageResizer','YandexImageResizer', @@ -1663,12 +1733,15 @@ 'Accoona\-AI\-Agent/','Accoona-AI-Agent', 'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot', 'adscanner','adscanner', +'AI2Bot','AI2Bot (Allen Institute)', 'aiHitBot/','aiHitBot', 'aipbot/','aipbot', 'AlphaBot','AlphaBot', +'anthropic\-ai/','anthropic-ai', 'Apache\-HttpClient/','Apache-HttpClient', 'Apexoo[\x20]Spider','Apexoo Spider', 'Applebot/','Applebot', +'Applebot\-Extended/','Applebot-Extended', 'arcemedia','AdsBot-ArceMedia', 'archive\.org_bot','archive.org_bot', 'Babya[\x20]Discoverer','Babya Discoverer', @@ -1683,21 +1756,25 @@ 'bitlybot','bit.ly', 'bl\.uk_lddc_bot/','bl.uk_lddc_bot', 'BLEXBot/','BLEXBot', -'bnf.fr_bot','bnf.fr_bot', +'bnf\.fr_bot','bnf.fr_bot', 'boitho\.com\-dc/','boitho.com-dc', 'BoogleBot','BoogleBot', 'BusinessBot:','BusinessBot:', 'BW/','BW', -'Bytespider','Bytespider', +'Bytespider/','Bytespider (Bytedance)', 'CatchBot/','CatchBot', 'CB/Nutch','CB/Nutch', -'CCBot/','CCBot', +'CCBot/','CCBot (Common Crawl, open/free AI dataset)', +'ChatGPT\-User/','ChatGPT-User (OpenAI)', 'CheckMarkNetwork/','CheckMarkNetwork', +'ClaudeBot/','ClaudeBot (Anthropic)', +'claude\-web/','claude-web (Anthropic)', 'Cliqzbot/','Cliqzbot', 'CMS[\x20]Crawler','CMS Crawler', +'cohere\-ai/','cohere-ai', 
'Companybook\-Crawler','Companybook-Crawler', 'ConveraCrawler/','ConveraCrawler', -'Contacts-Crawler','Contacts-Crawler', +'Contacts\-Crawler','Contacts-Crawler', 'contxbot','contxbot', 'cosmos/','cosmos', 'CRMNLCrawlAgent','CRMNLCrawlAgent', @@ -1714,6 +1791,7 @@ 'daum','daum', 'Deepnet[\x20]Explorer','Deepnet Explorer', 'DeuSu/','DeuSu', +'Diffbot/', 'Diffbot', 'Digincore','Digincore', 'Discordbot/','Discordbot', 'Dispatch/','Dispatch', @@ -1725,7 +1803,8 @@ 'DomainSONOCrawler/','DomainSONOCrawler', 'DomainStatsBot/','DomainStatsBot', 'DotBot/','DotBot', -'DuckDuckBot-Https','DuckDuckBot-Https', +'DuckAssistBot/', 'DuckAssist (DuckDuckGo AI)', +'DuckDuckBot\-Https','DuckDuckBot-Https', 'DuckDuckBot','DuckDuckBot', 'DuckDuckGo\-Favicons\-Bot/','DuckDuckGo-Favicons-Bot', 'ELinks/','ELinks', @@ -1738,7 +1817,7 @@ 'Exabot/','Exabot', 'ExtLinksBot','ExtLinksBot', 'ExperianCrawlUK','ExperianCrawlUK', -'facebookexternalhit/','facebookexternalhit', +'facebookexternalhit/','facebookexternalhit (Meta/Facebook/Instagram shared link)', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise crawleradmin.t-info@telekom.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise T-Info_BI_cluster crawleradmin.t-info@telekom.de', 'FAST\-WebCrawler/','FAST-WebCrawler', @@ -1796,6 +1875,7 @@ 'Jooblebot','Jooblebot', 'KomodiaBot/','KomodiaBot', 'Konqueror/','Konqueror', +'laserlikebot','laserlikebot', 'Lightspeed','Lightspeed', 'linkapediabot','linkapediabot', 'metager\-linkchecker','metager-linkchecker', @@ -1818,12 +1898,16 @@ 'MegaIndex\.ru/','MegaIndex.ru', 'merzscope','merzscope', 'Meta_Bot','Meta_Bot', +'meta\-externalads/','meta-externalads (Meta/Facebook/Instagram) ads)', +'meta\-externalagent/','meta-externalagent (Meta/Facebook/Instagram indexer + AI)', +'meta\-externalfetcher/','meta-externalfetcher (Meta/Facebook/Instagram user-initiated fetch)', +'meta\-webindexer/','meta-webindexer 
(Meta/Facebook/Instagram indexer + AI)', 'mfibot/','mfibot', 'microsoft.*discovery','Microsoft Office Protocol Discovery', 'missigua_locator','missigua_locator', +'MistralAI\-User/','MistralAI-User', 'MixrankBot','MixrankBot', 'MJ12bot/','MJ12bot', -'mojeek','mojeek', 'MojeekBot/','MojeekBot', 'Mojolicious','Mojolicious', 'MXT/Nutch','MXT/Nutch', @@ -1834,7 +1918,7 @@ 'NerdyBot','NerdyBot', 'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler', 'NetResearchServer/','NetResearchServer', -'Nimbostratus-Bot','Nimbostratus-Bot', +'Nimbostratus\-Bot','Nimbostratus-Bot', 'nominet','nominet', 'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch', 'nutch\-1\.4/','nutch-1.4', @@ -1842,8 +1926,10 @@ 'NutchCVS/','NutchCVS', 'o\.uk[\x20]robot','o uk.robot', 'ocrawler;','ocrawler;', +'OAI\-SearchBot/','OAI-SearchBot (OpenAI)', 'ODP[\x20]link[\x20]checker','ODP link checker', 'Offline[\x20]Explorer/','Offline Explorer', +'omgili/','omgili (webz.io)', 'OmniExplorer_Bot/','OmniExplorer_Bot', 'OrangeBot/','OrangeBot', 'Orliac','Orliac', @@ -1853,6 +1939,8 @@ 'pdffillerbot/','pdffillerbot', 'peopleman','peopleman', 'PetalBot','PetalBot', +'PerplexityBot/','PerplexityBot', +'Perplexity\-User/', 'Perplexity-User', 'PhantomJS','PhantomJS', 'PHP/5\.2\.8','PHP/5.2.8', 'Pinterestbot','Pinterestbot', @@ -1868,7 +1956,7 @@ 'python_wk_crawler','python_wk_crawler', 'Python\-urllib/','Python-urllib', 'QCrawl','QCrawl', -'Quick-Crawler','Quick-Crawler', +'Quick\-Crawler','Quick-Crawler', 'ResearchBot','ResearchBot', 'roboto','roboto', 'rogerbot/','rogerbot', @@ -1888,7 +1976,7 @@ 'Seekmo','Seekmo', 'semanticbot','semanticbot', 'SemrushBot/','SemrushBot', -'SemrushBot-SI','SemrushBot-SI', +'SemrushBot\-SI','SemrushBot-SI', 'seo\-audit\-check\-bot/','seo-audit-check-bot', 'Seobility','Seobility', 'SEOkicks\-Robot','SEOkicks-Robot', @@ -1909,6 +1997,7 @@ 'SpuhexBot','SpuhexBot', 'spyonweb','spyonweb', 'ssearch_bot','ssearch_bot', +'Storebot\-Google/','Storebot-Google', 
'Streamline3Bot','Streamline3Bot', 'SurdotlyBot/','SurdotlyBot', 'SurveyBot/','SurveyBot', @@ -1920,6 +2009,7 @@ 'Test[\x20]Spider','Test Spider', 'TestCrawler','TestCrawler', 'The[\x20]Knowledge[\x20]AI', 'The Knowledge AI', +'Timpibot/','Timpibot (timpi.io)', 'TkBot','TkBot', 'tracemyfile','tracemyfile', 'trendiction','trendiction', @@ -1927,7 +2017,7 @@ 'TurnitinBot','TurnitinBot', 'TweetmemeBot/','TweetmemeBot', 'UCY/Nutch','UCY/Nutch', -'uni-leipzig\.de','uni-leipzig.de', +'uni\-leipzig\.de','uni-leipzig.de', 'Uptimebot/','Uptimebot', 'UptimeRobot/','UptimeRobot', 'URL[\x20]Checker','URL Checker', @@ -1965,6 +2055,7 @@ 'yak','yak-linkfluence', 'YisouSpider','YisouSpider', 'yoozBot','yoozBot', +'YouBot','YouBot (You.com)', 'Your\-Website\-Sucks','Your-Website-Sucks', 'zoominfobot','zoominfobot', 'zspider/','zspider', @@ -2172,7 +2263,7 @@ 'extreme[_+\s]picture[_+\s]finder','extreme picture finder', 'ezoom','ezoom', 'ezresult','ezresult', -'facebook','facebook', +'FacebookBot/','FacebookBot', 'facebot','facebot', 'fast\-search\-engine','fast-search-engine', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','matrix_s.p.a._-_fast_enterprise_crawler',