From: Laurent Destailleur Date: Thu, 20 Nov 2014 09:40:01 +0000 (+0100) Subject: Fix: No spaces into bot key. X-Git-Tag: AWSTATS_7_5~62 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2d289e4178f6ad1c938b157ba5c687378ae96e0a;p=thirdparty%2FAWStats.git Fix: No spaces into bot key. --- diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 5459bf2e..055b48b9 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -318,7 +318,7 @@ # added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ # -- fix - some robots were reported with _ where _ should have been a space. # changed Xenu Link Sleuth -# changed microsoft[_+ ]url[_+ ]control -> microsoft_url_control +# changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control # changed favorites_sweeper -> favorites_sweeper # -- updates # updated AskJeeves to Ask @@ -353,7 +353,7 @@ # used to know in which order to search Robot IDs. # Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more # Minor robots are in list2, used when LevelForRobotsDetection is 2 or more -# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+ ]' and are quoted. +# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. #------------------------------------------------------- @RobotsSearchIDOrder_list1 = ( # Common robots (In robot file) @@ -366,16 +366,16 @@ 'googlebot\-image', 'googlebot', 'google\-sitemaps', -'google[_+ ]web[_+ ]preview', +'google[_+\s]web[_+\s]preview', 'grabber', 'gulliver', -'virus[_+ ]detector', # Must be before harvest +'virus[_+\s]detector', # Must be before harvest 'harvest', 'htdig', 'jeeves', 'linkwalker', 'lilina', -'lycos[_+ ]', +'lycos[_+\s]', 'moget', 'muscatferret', 'myweb', @@ -528,7 +528,7 @@ 'kapsi', 'katipo', 'kilroy', -'ko[_+ ]yappo[_+ ]robot', +'ko[_+\s]yappo[_+\s]robot', 'kummhttp', 'labelgrabber\.txt', 'larbin', @@ -614,7 +614,7 @@ 'snooper', 'solbot', 'speedy', -'spider[_+ ]monkey', +'spider[_+\s]monkey', 'spiderbot', 'spiderline', 'spiderman', @@ -709,8 +709,8 @@ 'betabot', 'biglotron', 'bittorrent_bot', -'biz360[_+ ]spider', -'blogbridge[_+ ]service', +'biz360[_+\s]spider', +'blogbridge[_+\s]service', 'bloglines', 'blogpulse', 'blogsearch', @@ -724,7 +724,7 @@ 'boris', 'bubing', 'bumblebee', -'candlelight[_+ ]favorites[_+ ]inspector', +'candlelight[_+\s]favorites[_+\s]inspector', 'careerbot', 'cbn00glebot', 'cerberian_drtrs', @@ -760,7 +760,7 @@ 'edgeio\-retriever', 'ets_v', 'exactseek', -'extreme[_+ ]picture[_+ ]finder', +'extreme[_+\s]picture[_+\s]finder', 'eventax', 'everbeecrawler', 'everest\-vulcan', @@ -807,7 +807,7 @@ 'hoowwwer', 'hpprint', 'htmlparser', -'html[_+ ]link[_+ ]validator', +'html[_+\s]link[_+\s]validator', 'httrack', 'hundesuche\.com\-bot', 'i-bot', @@ -819,7 +819,7 @@ 'infomine', 'insurancobot', 'integromedb\.org', -'internet[_+ ]ninja', +'internet[_+\s]ninja', 'internetarchive', 'internetseer', 'internetsupervision', @@ -828,7 +828,7 @@ 'isearch2006', 'istellabot', 'iupui_research_bot', -'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility', +'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', 'justview', 'kalambot', 'kamano\.de_newsfeedverzeichnis', @@ -865,7 +865,7 @@ 'miadev', 'microsoft bits', 'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery', -'microsoft[_+ ]url[_+ ]control', +'microsoft[_+\s]url[_+\s]control', 'mini\-reptile', 'minirank', 'missigua_locator', @@ -895,8 +895,8 @@ 'nutch', # Must come after other nutch versions 'ocelli', 'octora_beta_bot', -'omniexplorer[_+ ]bot', -'onet\.pl[_+ ]sa', +'omniexplorer[_+\s]bot', +'onet\.pl[_+\s]sa', 'onfolio', 'opentaggerbot', 'openwebspider', @@ -908,7 +908,7 @@ 'pear_http_request_class', 'peerbot', 'perman', -'php[_+ ]version[_+ ]tracker', +'php[_+\s]version[_+\s]tracker', 'pictureofinternet', 'ping\.blo\.gs', 'plinki', @@ -931,7 +931,7 @@ 'sbider', 'schizozilla', 'scumbot', -'searchguild[_+ ]dmoz[_+ ]experiment', +'searchguild[_+\s]dmoz[_+\s]experiment', 'searchmetricsbot', 'seekbot', 'semrushbot', @@ -988,7 +988,7 @@ 'vortex', 'vse\/', 'w3c\-checklink', -'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', +'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'w3c_validator', 'watchmouse', 'wavefire', @@ -1001,7 +1001,7 @@ 'webfilter', 'webindexer', 'webminer', -'website[_+ ]monitoring[_+ ]bot', +'website[_+\s]monitoring[_+\s]bot', 'webvulncrawl', 'wells_search', 'wesee:search', @@ -1074,13 +1074,13 @@ '^motorola$', 'movabletype', # These appear to be bots trying to hide. All of the usual architecture data is missing. -'^mozilla\/3\.0 \(compatible$', +'^mozilla\/3\.0\s\(compatible$', '^mozilla\/4\.0$', -'^mozilla\/4\.0 \(compatible;\)$', +'^mozilla\/4\.0\s\(compatible;\)$', '^mozilla\/5\.0$', -'^mozilla\/5\.0 \(compatible;$', -'^mozilla\/5\.0 \(en\-us\)$', -'^mozilla\/5\.0 firefox\/3\.0\.5$', +'^mozilla\/5\.0\s\(compatible;$', +'^mozilla\/5\.0\s\(en\-us\)$', +'^mozilla\/5\.0\sfirefox\/3\.0\.5$', '^msie', # End of hiding bots. 'netnewswire', @@ -1181,15 +1181,15 @@ 'googlebot','Googlebot', 'google\-sitemaps', 'Google Sitemaps', 'grabber', 'Grabber (SDSC)', -'google[_+ ]web[_+ ]preview', 'Google Web Preview', +'google[_+\s]web[_+\s]preview', 'Google Web Preview', 'gulliver','Northern Light Gulliver', -'virus[_+ ]detector','virus_detector', +'virus[_+\s]detector','virus_detector', 'harvest','Harvest', 'htdig','ht://Dig', 'jeeves','Ask', 'linkwalker','LinkWalker', 'lilina','Lilina', -'lycos[_+ ]','Lycos', +'lycos[_+\s]','Lycos', 'moget','moget', 'muscatferret','Muscat Ferret', 'myweb','Internet Shinchakubin', @@ -1346,7 +1346,7 @@ 'kapsi','image.kapsi.net', 'katipo','Katipo', 'kilroy','Kilroy', -'ko[_+ ]yappo[_+ ]robot','KO_Yappo_Robot', +'ko[_+\s]yappo[_+\s]robot','KO_Yappo_Robot', 'kummhttp','KummHttp', 'labelgrabber\.txt','LabelGrabber', 'larbin','larbin', @@ -1435,7 +1435,7 @@ 'snooper','Snooper', 'solbot','Solbot', 'speedy','Speedy Spider', -'spider[_+ ]monkey','Spider monkey', +'spider[_+\s]monkey','Spider monkey', 'spiderbot','SpiderBot', 'spiderline','Spiderline Crawler', 'spiderlytics', 'Spiderlytics: No homepage, e-mail only: spider (at) spiderlytics.com', @@ -1531,8 +1531,8 @@ 'betabot','BetaBot', 'biglotron','Biglotron', 'bittorrent_bot','BitTorrent Bot', -'biz360[_+ ]spider','Biz360 spider', -'blogbridge[_+ ]service','BlogBridge Service', +'biz360[_+\s]spider','Biz360 spider', +'blogbridge[_+\s]service','BlogBridge Service', 'bloglines','Bloglines', 'blogpulse','BlogPulse ISSpider intelliseek.com', 'blogsearch','BlogSearch', @@ -1546,7 +1546,7 @@ 'boris', 'Boris', 'bubing', 'BUbiNG', 'bumblebee', 'Bumblebee (relevare.com)', -'candlelight[_+ ]favorites[_+ ]inspector','Candlelight_Favorites_Inspector', +'candlelight[_+\s]favorites[_+\s]inspector','Candlelight_Favorites_Inspector', 'careerbot', 'CareerBot', 'cbn00glebot','cbn00glebot', 'cerberian_drtrs','Cerberian Drtrs', @@ -1581,7 +1581,7 @@ 'edgeio\-retriever','edgeio-retriever', 'ets_v','ETS Enterprise Translation Server', 'exactseek','ExactSeek Crawler', -'extreme[_+ ]picture[_+ ]finder','Extreme_Picture_Finder', +'extreme[_+\s]picture[_+\s]finder','Extreme_Picture_Finder', 'eventax','eventax', 'everbeecrawler','EverbeeCrawler', 'everest\-vulcan','Everest-Vulcan', @@ -1628,7 +1628,7 @@ 'hoowwwer','HooWWWer', 'hpprint','HPPrint', 'htmlparser','HTMLParser', -'html[_+ ]link[_+ ]validator','Html_Link_Validator', +'html[_+\s]link[_+\s]validator','Html_Link_Validator', 'httrack','HTTrack off-line browser', 'hundesuche\.com\-bot','Hundesuche.com-Bot', 'i-bot','i-bot', @@ -1640,7 +1640,7 @@ 'infomine','INFOMINE VLCrawler', 'insurancobot','InsurancoBot', 'integromedb\.org','IntegromeDB', -'internet[_+ ]ninja','Internet_Ninja ', +'internet[_+\s]ninja','Internet_Ninja ', 'internetarchive','InternetArchive', 'internetseer', 'InternetSeer', 'internetsupervision','InternetSupervision', @@ -1648,7 +1648,7 @@ 'isearch2006','isearch2006', 'istellabot', 'IstellaBot', 'iupui_research_bot','IUPUI_Research_Bot', -'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility','JRTwine_Software_Check_Favorites_Utility', +'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','JRTwine_Software_Check_Favorites_Utility', 'justview', 'JustView', 'kalambot','KalamBot', 'kamano\.de_newsfeedverzeichnis','kamano.de NewsFeedVerzeichnis', @@ -1682,7 +1682,7 @@ 'miadev', 'MiaDev spider', 'microsoft bits', 'Microsoft Background Intelligent Transfer Service (BITS)?', 'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', -'microsoft[_+ ]url[_+ ]control','Microsoft URL Control', +'microsoft[_+\s]url[_+\s]control','Microsoft URL Control', 'minirank','miniRank', 'mini\-reptile','Mini-reptile', 'missigua_locator','Missigua_Locator', @@ -1712,8 +1712,8 @@ 'nutch','Nutch', 'ocelli','Ocelli', 'octora_beta_bot','Octora Beta Bot', -'omniexplorer[_+ ]bot','OmniExplorer Bot', -'onet\.pl[_+ ]sa','Onet.pl_SA', +'omniexplorer[_+\s]bot','OmniExplorer Bot', +'onet\.pl[_+\s]sa','Onet.pl_SA', 'onfolio','Onfolio', 'opentaggerbot','OpenTaggerBot', 'openwebspider','OpenWebSpider', @@ -1725,7 +1725,7 @@ 'pear_http_request_class','PEAR HTTP Request class', 'peerbot','PEERbot', 'perman', 'Perman surfer', -'php[_+ ]version[_+ ]tracker','PHP version tracker', +'php[_+\s]version[_+\s]tracker','PHP version tracker', 'pictureofinternet','PictureOfInternet', 'ping\.blo\.gs','ping.blo.gs', 'plinki','plinki', @@ -1749,7 +1749,7 @@ 'sbider','SBIder', 'schizozilla','Schizozilla', 'scumbot','Scumbot', -'searchguild[_+ ]dmoz[_+ ]experiment','SearchGuild_DMOZ_Experiment', +'searchguild[_+\s]dmoz[_+\s]experiment','SearchGuild_DMOZ_Experiment', 'searchmetricsbot','SearchmetricsBot', 'seekbot','Seekbot', 'semrushbot', 'SemrushBot', @@ -1805,7 +1805,7 @@ 'vortex','VORTEX', 'vse\/','VSE', 'w3c\-checklink','W3C Link Checker', -'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', 'W3C jigsaw CSS Validator', +'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'W3C jigsaw CSS Validator', 'w3c_validator','W3C Validator', 'watchmouse', 'WatchMouse Website Monitor', 'wavefire','Wavefire', @@ -1820,7 +1820,7 @@ 'webfilter','WebFilter', 'webindexer','WebIndexer', 'webminer','WebMiner', -'website[_+ ]monitoring[_+ ]bot','Website_Monitoring_Bot', +'website[_+\s]monitoring[_+\s]bot','Website_Monitoring_Bot', 'webvulncrawl', 'WebVulnCrawl', 'wells_search','Wells Search', 'wesee:search', 'WeSEE Bot', @@ -1911,13 +1911,13 @@ 'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', '^motorola$', 'Suspected Bot masquerading as "Motorola"', 'movabletype', 'movabletype', -'^mozilla\/3\.0 \(compatible$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/3\.0\s\(compatible$', 'Suspected bot masqurading as Mozilla', '^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/4\.0 \(compatible;\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/4\.0\s\(compatible;\)$', 'Suspected bot masqurading as Mozilla', '^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 \(compatible;$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 \(en\-us\)$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 firefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\s\(compatible;$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\s\(en\-us\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', '^msie', 'Suspected bot masquerading as M$ IE', 'netnewswire', 'netnewswire', ' netseer ', 'Net Seer', @@ -1990,7 +1990,7 @@ 'fast\-webcrawler'=>'AllTheWeb', 'googlebot'=>'Google', 'google\-sitemap'=>'Google', -'google[_+ ]web[_+ ]preview'=>'Google', +'google[_+\s]web[_+\s]preview'=>'Google', 'msnbot'=>'MSN', 'nutch'=>'Looksmart', 'scooter'=>'AltaVista',