From: eldy <> Date: Sat, 24 Aug 2013 19:11:32 +0000 (+0000) Subject: Update robot, search engines database. X-Git-Tag: AWSTATS_7_3~54 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bb39440fbfdbbfa5919968d0fca75db7443c969a;p=thirdparty%2FAWStats.git Update robot, search engines database. --- diff --git a/docs/awstats_changelog.txt b/docs/awstats_changelog.txt index 14306c5e..a77fe863 100644 --- a/docs/awstats_changelog.txt +++ b/docs/awstats_changelog.txt @@ -8,6 +8,7 @@ $Revision$ - $Author$ - $Date$ New features: - #877 Windows 8 + iOS Support in AWStats - Add command line option -version +- Update robot, search engines database. ***** 7.2 ***** diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index a58361c4..6581b622 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -39,10 +39,10 @@ # updated YahooSeeker description (blog crawler) # 2005-09-16 added link for http://linkchecker.sourceforge.net # added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) -# added Blogslive info@blogslive.com intelliseek.com +# added Blogslive info@blogslive.com intelliseek.com # added BlogPulse (ISSpider-3.0) intelliseek.com # 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) -# added EverbeeCrawler +# added EverbeeCrawler # added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html # added link for Bloglines http://www.bloglines.com # 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) @@ -91,11 +91,11 @@ # added EARTHCOM.info www.earthcom.info # added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] # added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] -# 2006-01-01 +# 2006-01-01 # added Dulance http://www.dulance.com/bot.jsp # added MojeekBot http://www.mojeek.com/bot.html # added nicebot http://www.egghelp.org/setup.htm ? -# added Snappy http://www.urltrends.com/faq.php +# added Snappy http://www.urltrends.com/faq.php # added sohu agent # added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] # added zspider http://feedback.redkolibri.com/ @@ -111,63 +111,63 @@ # added LetsCrawl.com http://letscrawl.com # added ichiro http://help.goo.ne.jp/door/crawlerE.html # 2006-01-27 additional 22 robots from a list provided by Moizes Gabor -# added ALeadSoftbot http://www.aleadsoft.com/bot.htm -# added CipinetBot http://www.cipinet.com/bot.html -# added Cuasarbot http://www.cuasar.com/ -# added Dumbot http://www.dumbfind.com/ -# added Extreme_Picture_Finder http://www.exisoftware.com/ -# added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots +# added ALeadSoftbot http://www.aleadsoft.com/bot.htm +# added CipinetBot http://www.cipinet.com/bot.html +# added Cuasarbot http://www.cuasar.com/ +# added Dumbot http://www.dumbfind.com/ +# added Extreme_Picture_Finder http://www.exisoftware.com/ +# added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots # added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it -# added InsurancoBot http://www.fastspywareremoval.com/ +# added InsurancoBot http://www.fastspywareremoval.com/ # added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca +# added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca # added Kurzor http://www.easymail.hu/ cursor@easymail.hu # added NutchCVS http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org # added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added Orbiter http://www.dailyorbit.com/bot.htm -# added PHP_version_tracker http://www.nexen.net/phpversion/bot.php -# added SuperBot http://www.sparkleware.com/superbot/ +# added Orbiter http://www.dailyorbit.com/bot.htm +# added PHP_version_tracker http://www.nexen.net/phpversion/bot.php +# added SuperBot http://www.sparkleware.com/superbot/ # added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com -# added TestBot http://www.agbrain.com/ -# added TutorGigBot http://www.tutorgig.info/ -# added WebIndexer mailto://webindexerv1@yahoo.com +# added TestBot http://www.agbrain.com/ +# added TutorGigBot http://www.tutorgig.info/ +# added WebIndexer mailto://webindexerv1@yahoo.com # added WebMiner http://64.124.122.252/feedback.html -# 2006-02-01 +# 2006-02-01 # added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 # added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 # additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] # added Candlelight_Favorites_Inspector -# added DomainChecker -# added EasyDL -# added FavOrg +# added DomainChecker +# added EasyDL +# added FavOrg # added Favorites_Sweeper # added Html_Link_Validator -# added Internet_Ninja +# added Internet_Ninja # added JRTwine_Software_Check_Favorites_Utility # fixed Microsoft_URL_Control -# added miniRank +# added miniRank # added Missigua_Locator -# added NPBot -# added Ocelli -# added Onet.pl_SA -# added proodleBot -# added SearchGuild_DMOZ_Experiment -# added Susie +# added NPBot +# added Ocelli +# added Onet.pl_SA +# added proodleBot +# added SearchGuild_DMOZ_Experiment +# added Susie # added Website_Monitoring_Bot # added Xenu_Link_Sleuth # 2006-05-15 # added ASPseek http://www.aspseek.org/ -# added AdamM Bot http://home.blic.net/adamm/ +# added AdamM Bot http://home.blic.net/adamm/ # added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html # added arianna.libero.it (Italian Portal/search engine) # added Biz360 spider http://www.biz360.com # added BlogBridge Service http://www.blogbridge.com/ -# added BlogSearch http://www.icerocket.com/ +# added BlogSearch http://www.icerocket.com/ # added libcrawl # added edgeio-relanshanbottriever http://www.edgeio.com # added FeedFlow http://feedflow.com/about # added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt -# added Java catchall - used by many spam bots +# added Java catchall - used by many spam bots # added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb # added msnbot-media http://search.msn.com/msnbot.htm # added MT::Telegraph::Agent @@ -197,7 +197,7 @@ # added ActiveBookmark http://www.libmaster.com/active_bookmark.php # added BIGLOTRON http://www.biglotron.com/robot.html # added Bookmark-Manager http://bkm.sourceforge.net/ -# added cbn00glebot +# added cbn00glebot # added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 # added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork # added CheckWeb link validator http://p.duby.free.fr/chkweb.htm @@ -205,9 +205,9 @@ # added ConveraCrawler http://www.authoritativeweb.com/crawl/ # added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ # added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php -# added Cursor http://adcenter.hu/docs/en/bot.html +# added Cursor http://adcenter.hu/docs/en/bot.html # added Custo http://www.netwu.com/custo/ -# added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ +# added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ # added Deepindex http://www.deepindex.net/faq.php # added DNSGroup http://www.dnsgroup.com/ # added DoCoMo http://www.nttdocomo.co.jp/ @@ -219,13 +219,13 @@ # added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ # added FeedValidator http://feedvalidator.org/ # added FilmkameraBot http://www.filmkamera.at/bot.html -# added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece +# added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece # added Global Fetch http://www.wesonet.com/ # added GOFORITBOT http://www.goforit.com/about/ # added GoForIt.com http://www.goforit.com/about/ # added GPU p2p crawler http://gpu.sourceforge.net/search_engine.php # added HooWWWer http://cosco.hiit.fi/search/hoowwwer/ -# added HPPrint +# added HPPrint # added HTMLParser http://htmlparser.sourceforge.net/ # added Hundesuche.com-Bot http://www.hundesuche.com/ # added InfoBot http://www.infobot.org/ @@ -247,10 +247,10 @@ # added Matrix S.p.A. - FAST Enterprise Crawler http://tin.virgilio.it/ # added Megite http://www.megite.com/ # added Metaspinner http://index.meta-spinner.de/ -# added Mini-reptile +# added Mini-reptile # added Misterbot http://www.misterbot.fr/ # added Miva http://www.miva.com/ -# added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b +# added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b # added MSRBOT http://research.microsoft.com/research/sv/msrbot/ # added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 # added Mydoyouhike http://www.doyouhike.net/my @@ -267,14 +267,14 @@ # added PictureOfInternet http://malfunction.org/poi/ # added plinki http://www.plinki.com/ # added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b -# added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b -# added ProjectWF-java-test-crawler +# added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b +# added ProjectWF-java-test-crawler # added PyQuery http://sourceforge.net/projects/pyquery/ -# added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ +# added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ # added Scumbot # added Sensis Web Crawler http://www.sensis.com.au/ # added snap.com beta crawler http://www.snap.com/ -# added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ +# added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ # added STEROID Download http://faqs.org.ru/progr/pascal/delphi_internet2.htm # added Suchfin-Bot http://www.suchfin.de/ # added Sunrise http://www.sunrisexp.com/ @@ -287,15 +287,15 @@ # added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) # added updated http://www.updated.com/ # added Vermut http://vermut.aol.com -# added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html +# added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html # added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb # added VSE http://www.vivisimo.com/ # added webcrawl.net http://www.webcrawl.net/ # added Web Downloader http://www.krasu.ru/soft/chuchelo/ # added Webdup http://www.webdup.com/en/index.html -# added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b +# added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b # added WordPress http://wordpress.org/ -# added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ +# added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ # added Xenu's Link Sleuth (with ') # added xirq http://www.xirq.com/ # added yoogliFetchAgent http://www.yoogli.com/ @@ -306,6 +306,8 @@ # changed favorites_sweeper -> favorites_sweeper # -- updates # updated AskJeeves to Ask +# 2012-06-05 Albrecht Mueller +# added Grabber from SDSC (San Diego Supercomputer Center). # to do MS Search 4.0 Robot @@ -313,7 +315,7 @@ # Robots list was found at http://www.robotstxt.org/wc/active/all.txt -# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html +# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html # Rem: To avoid bad detection, some robot's ids were removed from this list: # - Robots with ID of 3 letters only # - Robots called 'webs' and 'tcl' @@ -341,6 +343,7 @@ 'ferret', 'googlebot', 'google\-sitemaps', +'grabber', 'google[_+ ]web[_+ ]preview', 'gulliver', 'virus[_+ ]detector', # Must be before harvest @@ -378,13 +381,14 @@ 'webbase', 'webcollage', 'cfetch', -'zyborg', # Must be before wisenut +'zyborg', # Must be before wisenut 'wisenutbot' ); @RobotsSearchIDOrder_list2 = ( # Less common robots (In robot file) '[^a]fish', 'abcdatos', +'abonti\.com', 'acme\.spider', 'ahoythehomepagefinder', 'alkaline', @@ -475,6 +479,7 @@ 'iaskspider', 'hl_ftien_spider', 'sogou', +'icjobs\.de', 'iconoclast', 'ilse', 'imagelock', @@ -777,6 +782,7 @@ 'internetarchive', 'internetseer', 'internetsupervision', +'ips\-agent', 'irlbot', 'isearch2006', 'iupui_research_bot', @@ -799,6 +805,7 @@ 'letscrawl\.com', 'libcrawl', 'linkbot', +'linkdex\.com', 'link_valet_online', 'metager\-linkchecker', # Must be before linkchecker 'linkchecker', @@ -883,6 +890,7 @@ 'seznambot', 'shim\-crawler', 'shoutcast', +'siteexplorer\.info', 'slysearch', 'snap\.com_beta_crawler', 'sohu\-search', @@ -1009,7 +1017,7 @@ '^motorola$', 'movabletype', # These appear to be bots trying to hide. All of the usual architecture data is missing. -'^mozilla\/3\.0 \(compatible$', +'^mozilla\/3\.0 \(compatible$', '^mozilla\/4\.0$', '^mozilla\/4\.0 \(compatible;\)$', '^mozilla\/5\.0$', @@ -1113,6 +1121,7 @@ 'contentmatch','Yahoo!China ContentMatch Crawler', 'googlebot','Googlebot', 'google\-sitemaps', 'Google Sitemaps', +'grabber', 'Grabber (SDSC)', 'google[_+ ]web[_+ ]preview', 'Google Web Preview', 'gulliver','Northern Light Gulliver', 'virus[_+ ]detector','virus_detector', @@ -1154,6 +1163,7 @@ # Less common robots (In robot file) '[^a]fish','Fish search', 'abcdatos','ABCdatos BotLink', +'abonti\.com','Abonti WebSearch', 'acme\.spider','Acme.Spider', 'ahoythehomepagefinder','Ahoy! The Homepage Finder', 'alkaline','Alkaline', @@ -1218,7 +1228,7 @@ 'felix','Felix IDE', 'fetchrover','FetchRover', 'fido','fido', -'finnish','H���ki', +'finnish','Finnish', 'fireball','KIT-Fireball', 'fouineur','Fouineur', 'francoroute','Robot Francoroute', @@ -1244,6 +1254,11 @@ 'iaskspider','Sina Iask Spider', 'hl_ftien_spider','Hylanda', 'sogou','Sogou Spider', +'icjobs\.de', 'iCjobs Spider (only hits on robots.txt counted!)', +#The user agent string of the icjobs-spider seems to contain the +#identifying string only when it accesses the robots.txt file. +#When it accesses the actual content it does not identify itself as +#a spider. Thus traffic of this spider is counted as user traffic. 'iconoclast','Popular Iconoclast', 'ilse','Ingrid', 'imagelock','Imagelock', @@ -1254,6 +1269,7 @@ 'infospider','InfoSpiders', 'inspectorwww','Inspector Web', 'intelliagent','IntelliAgent', +'ips\-agent', 'ips-agent Verisign(?) - no reliable information found.', 'irobot','I, Robot', 'iron33','Iron33', 'israelisearch','Israeli-search', @@ -1431,7 +1447,7 @@ 'activebookmark','ActiveBookmark', 'adamm_bot','AdamM Bot', 'almaden','IBM Almaden Research Center WebFountain™', -'aipbot','aipbot', +'aipbot','aipbot', 'aleadsoftbot','ALeadSoftbot', 'alpha_search_agent','Alpha Search Agent', 'allrati','Allrati', @@ -1468,7 +1484,7 @@ 'cfnetwork','CFNetwork', 'cipinetbot','CipinetBot', 'checkweb_link_validator','CheckWeb link validator', -'commons\-httpclient','Jakarta commons-httpclient', +'commons\-httpclient','Jakarta commons-httpclient', 'computer_and_automation_research_institute_crawler','Computer and Automation Research Institute Crawler', 'converamultimediacrawler','ConveraMultiMediaCrawler', 'converacrawler','ConveraCrawler', @@ -1571,6 +1587,7 @@ 'libcrawl','Crawl libcrawl', 'link_valet_online','Link Valet Online', 'linkbot','LinkBot', +'linkdex\.com', 'Linkdex', 'linkchecker','LinkChecker', 'livejournal\.com', 'LiveJournal.com', 'magpierss', 'MagpieRSS', @@ -1652,10 +1669,11 @@ 'seznambot','SeznamBot', 'shim\-crawler','Shim-Crawler', 'shoutcast','Shoutcast Directory Service', +'siteexplorer\.info', 'Site Explorer', 'slysearch','SlySearch', 'snap\.com_beta_crawler','snap.com beta crawler', 'sohu\-search','sohu-search', -'sohu','sohu agent', +'sohu','sohu agent', 'snappy','Snappy', 'sphere_scout','Sphere Scout', 'spip','SPIP', @@ -1686,7 +1704,7 @@ 'unchaos_bot_hybrid_web_search_engine','UnChaos Bot Hybrid Web Search Engine', 'unido\-bot','unido-bot', 'updated','updated', -'ustc\-semantic\-group','USTC-Semantic-Group', +'ustc\-semantic\-group','USTC-Semantic-Group', 'vagabondo\-wap','Vagabondo-WAP', 'vagabondo','Vagabondo', 'vermut','Vermut', @@ -1753,98 +1771,98 @@ 'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')', '[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')', 'curl', 'Common *nix tool for automating web document retireval. Most likely a bot.', -'php', 'A PHP script', -'ruby\/', 'Ruby script', +'php', 'A PHP script', +'ruby\/', 'Ruby script', # Additional bots found by Sussex. '^[1-3]$', 'Generic bot identified as "1", "2" or "3"', 'alltop', 'alltop', -'applesyndication', 'applesyndication', -'asynchttpclient', 'asynchttpclient', -'bingbot', 'bingbot', -'blogged_crawl', 'blogged_crawl', +'applesyndication', 'applesyndication', +'asynchttpclient', 'asynchttpclient', +'bingbot', 'bingbot', +'blogged_crawl', 'blogged_crawl', 'bloglovin', 'bloglovin', 'butterfly', 'butterfly', -'buzztracker', 'buzztracker', +'buzztracker', 'buzztracker', 'carpathia', 'carpathia', -'catbot', 'catbot', -'chattertrap', 'chattertrap', -'check_http', 'check_http (nagios)', -'coldfusion', 'coldfusion', +'catbot', 'catbot', +'chattertrap', 'chattertrap', +'check_http', 'check_http (nagios)', +'coldfusion', 'coldfusion', 'covario', 'covario', -'daylifefeedfetcher', 'daylifefeedfetcher', -'discobot', 'discobot', +'daylifefeedfetcher', 'daylifefeedfetcher', +'discobot', 'discobot', 'dlvr\.it', 'dlvr.it', 'dreamwidth', 'dreamwidth', -'drupal', 'Drupal Site', +'drupal', 'Drupal Site', 'ezoom', 'ezoom', -'feedmyinbox', 'feedmyinbox', +'feedmyinbox', 'feedmyinbox', 'feedroll\.com', 'feedroll.com', 'feedzira', 'feedzira', -'fever\/', 'Feed a Fever', -'freenews', 'freenews', +'fever\/', 'Feed a Fever', +'freenews', 'freenews', 'geohasher', 'geohasher', -'hanrss', 'hanrss', -'inagist', 'inagist', -'jacobin club', 'jacobin club', +'hanrss', 'hanrss', +'inagist', 'inagist', +'jacobin club', 'jacobin club', 'jakarta', 'jakarta', -'js\-kit', 'js-kit', -'largesmall crawler', 'largesmall crawler', -'linkedinbot', 'linkedinbot', -'longurl', 'longurl', -'metauri', 'metauri', -'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', +'js\-kit', 'js-kit', +'largesmall crawler', 'largesmall crawler', +'linkedinbot', 'linkedinbot', +'longurl', 'longurl', +'metauri', 'metauri', +'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', '^motorola$', 'Suspected Bot masquerading as "Motorola"', -'movabletype', 'movabletype', -'^mozilla\/3\.0 \(compatible$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/4\.0 \(compatible;\)$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 \(compatible;$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 \(en\-us\)$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 firefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', -'^msie', 'Suspected bot masquerading as M$ IE', +'movabletype', 'movabletype', +'^mozilla\/3\.0 \(compatible$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/4\.0 \(compatible;\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0 \(compatible;$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0 \(en\-us\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0 firefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', +'^msie', 'Suspected bot masquerading as M$ IE', 'netnewswire', 'netnewswire', ' netseer ', 'Net Seer', -'netvibes', 'netvibes', +'netvibes', 'netvibes', 'newrelicpinger', 'newrelicpinger', 'newsfox', 'Fox News', -'nextgensearchbot', 'nextgensearchbot', -'ning', 'ning', -'pingdom', 'pingdom', -'pita', 'pita (pain in the ass?)', -'postpost', 'postpost', +'nextgensearchbot', 'nextgensearchbot', +'ning', 'ning', +'pingdom', 'pingdom', +'pita', 'pita (pain in the ass?)', +'postpost', 'postpost', 'postrank', 'postrank', -'printfulbot', 'printfulbot', -'protopage', 'protopage', -'proximic', 'proximic', -'quipply', 'quipply', +'printfulbot', 'printfulbot', +'protopage', 'protopage', +'proximic', 'proximic', +'quipply', 'quipply', 'r6\_', 'Radian 6 Crawler', -'ratingburner', 'ratingburner', -'regator', 'regator', -'rome client', 'rome client', -'rpt\-httpclient', 'rpt-httpclient', -'rssgraffiti', 'rssgraffiti', +'ratingburner', 'ratingburner', +'regator', 'regator', +'rome client', 'rome client', +'rpt\-httpclient', 'rpt-httpclient', +'rssgraffiti', 'rssgraffiti', 'sage\+\+', 'sage++', 'scoutjet', 'ScoutJet crawler for Blekko.', -'simplepie', 'simplepie', -'sitebot', 'sitebot', -'summify\.com', 'summify.com', -'superfeedr', 'superfeedr', -'synthesio', 'synthesio', +'simplepie', 'simplepie', +'sitebot', 'sitebot', +'summify\.com', 'summify.com', +'superfeedr', 'superfeedr', +'synthesio', 'synthesio', 'teoma', 'teoma', -'topblogsinfo', 'topblogsinfo', -'topix\.net', 'topix.net', -'trapit', 'trapit', -'trileet', 'trileet', +'topblogsinfo', 'topblogsinfo', +'topix\.net', 'topix.net', +'trapit', 'trapit', +'trileet', 'trileet', 'tweetedtimes', 'The Tweeted Times', -'twisted pagegetter', 'twisted pagegetter', -'twitterbot', 'twitterbot', -'twitterfeed', 'twitterfeed', -'unwindfetchor', 'unwindfetchor', -'wazzup', 'wazzup', +'twisted pagegetter', 'twisted pagegetter', +'twitterbot', 'twitterbot', +'twitterfeed', 'twitterfeed', +'unwindfetchor', 'unwindfetchor', +'wazzup', 'wazzup', 'windows\-rss\-platform', 'windows-rss-platform', 'wiumi', 'wiumi', -'xydo', 'xydo', +'xydo', 'xydo', 'yahoo! slurp', 'Additional Yahoo bots.', 'yahoo pipes', 'Additional Yahoo bots.', 'yahoo\-newscrawler', 'Additional Yahoo bots.', @@ -1852,13 +1870,13 @@ 'yahooexternalcache', 'Additional Yahoo bots.', 'yahoo! searchmonkey', 'Additional Yahoo bots.', 'yahooysmcm', 'Additional Yahoo bots.', -'yammer', 'yammer', -'yandexbot', 'yandexbot', -'yeti', 'yeti', +'yammer', 'yammer', +'yandexbot', 'yandexbot', +'yeti', 'yeti', 'yie8', 'yie8', -'youdao', 'youdao', -'yourls', 'yourls', -'zemanta', 'zemanta', +'youdao', 'youdao', +'yourls', 'yourls', +'zemanta', 'zemanta', 'zend_http_client', 'Zend Http Client', 'no_user_agent','Unknown robot (identified by empty user agent string)', # Unknown robots identified by hit on robots.txt @@ -1903,4 +1921,4 @@ 'yahooysmcm'=>'Yahoo' ); -1; +1; \ No newline at end of file diff --git a/wwwroot/cgi-bin/lib/search_engines.pm b/wwwroot/cgi-bin/lib/search_engines.pm index 52907ab9..9190ac0d 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pm +++ b/wwwroot/cgi-bin/lib/search_engines.pm @@ -303,6 +303,14 @@ 'start\.iminent\.com', 'www\.searchmobileonline\.com', 'int\.search-results\.com', +'www2\.inbox\.com', +'www\.govome\.com', +'find1friend\.com', +'start\.mysearchdial\.com', +'go\.speedbit\.com', +'search\.certified-toolbar\.com', +'search\.sweetim\.com', +'search\.searchcompletion\.com', # Chello Portals 'chello\.at', 'chello\.be', @@ -390,6 +398,11 @@ 'suche\.aol\.de', 'www\.startxxl\.com', 'www\.benefind\.de', +'www\.amazon\.de.*search', #Just as a reminder, probably will not work as AWstats seem to consider the host part of an URL only +'de\.wow\.com', +'www\.vlips\.de', +'www\.metager\.de', +'search\.1und1\.de', # Minor Hungarian search engines 'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu', 'tango\.hu', @@ -541,6 +554,14 @@ 'start\.iminent\.com', 'iminent', 'www\.searchmobileonline\.com', 'searchmobileonline', 'int\.search-results\.com', 'nortonsavesearch', +'www2\.inbox\.com', 'inbox', +'www\.govome\.com', 'govome', +'find1friend\.com', 'find1friend', +'start\.mysearchdial\.com', 'mysearchdial', +'go\.speedbit\.com', 'speedbit', +'search\.certified-toolbar\.com', 'certifiedtoolbarsearch', +'search\.sweetim\.com', 'sweetim', +'search\.searchcompletion\.com', 'searchcompletion', # Chello Portals 'chello\.at','chelloat', 'chello\.be','chellobe', @@ -667,6 +688,11 @@ 'suche\.aol\.de', 'aolsuche', 'www\.startxxl\.com', 'startxxl', 'www\.benefind\.de', 'benefind', +'www\.amazon\.de.*search', 'amazonsearch', #Not clear if this matches amazon searches only +'de\.wow\.com', 'wowsearch', +'www\.vlips\.de', 'vlips_de', +'www\.metager\.de', 'metager', +'search\.1und1\.de', 'search_1und1_de', # Minor Hungarian search engines 'heureka\.hu','heureka', 'vizsla\.origo\.hu','origo', @@ -832,6 +858,14 @@ 'iminent', 'q=', 'searchmobileonline', 'q=', 'nortonsavesearch', 'q=', +'inbox', 'q(?:kw)?=', +'govome', 'q=', +'find1friend', 'q=', +'mysearchdial', 'q=', +'speedbit', 'q=', +'certifiedtoolbarsearch', 'q=', +'sweetim', 'q=', +'searchcompletion', 'q=', # Chello Portals 'chelloat','q1=', 'chellobe','q1=', @@ -921,6 +955,11 @@ 'aolsuche', 'q=', 'startxxl', 'q=', 'benefind', 'q=', +'amazonsearch', 'query=', +'wowsearch', 'q=', +'vlips_de', 'q=', +'metager', 'eingabe=', +'search_1und1_de', 'q=', # Minor Hungarian search engines 'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=', 'keresolap_hu','q=', @@ -1084,6 +1123,14 @@ 'iminent', 'Iminent', 'searchmobileonline', 'Search Mobile Online (StartApp)', 'nortonsavesearch', 'Norton Safe Search', +'inbox', 'Inbox Search', +'govome', 'Govome', +'find1friend', 'Find1Friend', +'mysearchdial', 'My Search Dial', +'speedbit', 'Speedbit', +'certifiedtoolbarsearch', 'Certified-Toolbar Search', +'sweetim', 'SweetIM Search', +'searchcompletion', 'SearchCompletion Search', # Chello Portals 'chelloat','Chello Austria', 'chellobe','Chello Belgium', @@ -1175,6 +1222,11 @@ 'aolsuche', 'AOL Suche', 'startxxl', 'StartXXL', 'benefind', 'benefind', +'amazonsearch', 'Amazon Web Search', +'wowsearch', 'Wow Search', +'vlips_de', 'vlips.de', +'metager', 'MetaGer', +'search_1und1_de', '1&1 Suche', # Minor hungarian search engines 'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkereso', 'goliat','Goliat', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Kereso', 'tango_hu','Tango', @@ -1264,4 +1316,4 @@ #} #print @SearchEnginesSearchIDOrder_list1." ".@SearchEnginesSearchIDOrder_list2." ".@SearchEnginesSearchIDOrder_listgen; -1; +1; \ No newline at end of file