From: Laurent Destailleur Date: Sun, 28 Jul 2024 13:42:06 +0000 (+0200) Subject: Update test files X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=48d18c2334b78ab9bfd77e1ebe590bb808d331e7;p=thirdparty%2FAWStats.git Update test files --- diff --git a/robots.pm b/robots.pm deleted file mode 100644 index 6290432a..00000000 --- a/robots.pm +++ /dev/null @@ -1,2786 +0,0 @@ -# AWSTATS ROBOTS DATABASE -#------------------------------------------------------- -# If you want to add robots to extend AWStats database detection capabilities, -# you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. - -# The entry in RobotsSearchIDOrder_listx is a Perl regular expression -# (see http://perldoc.perl.org/perlreref.html). AWSTats applies these -# expressions to the user agent string in the order given by the lists. The -# first match specifies the robot. -# -# Note: This regular expression must not contain any whitespace. -# Otherwise AWStats will produce lines in the database that -# will be misinterpreted and as a consequence the corresponding data in the -# generated HTML reports will be wrong. If you want to match whitespace in -# the user agent string, use other constructs like '\s', '[:blank:]', -# '\p{IsSpace}', '\x20' etc. -# -# The corresponding entry in RobotsHashIDLib contains the regular expression -# as key, followed by a string containing HTML-text. AWStats inserts this -# text into reports to describe the bot. If possible the text should contain -# a link to the bot home page. This makes it easier for sysadmins to find -# the information necessary e.g. to adapt the robots.txt file. -# -# An entry in the RobotsAffiliateLib is not necessary. An entry in this list -# contains as first part the regular expression specifying the bot. The -# second part is a string that gives the Company or product managing the bot. -# This information is not used yet. 
-# -# There are several sorts of bots that AWStats is not able to detect and -# therefore a considerable amount of bot generated traffic counts -# as user traffic: -# -# a) A crawler that identifies itself in the referrer string, but not in -# the user agent string. An example is the crawler from semalt.semalt.com. -# -# b) Crawlers that correctly access robots.txt but identify themselves in -# in the user agent string only once or just a few times. Most of the -# time a user agent string ist used that does not contain hints that -# a bot is involved. An example is the iCjobs spider. -# msnbot-UDiscovery/2.0b seems to show this behaviour too. -# -# -# -#------------------------------------------------------- - -# 2023-07-04 RobC -# Removed Dalvik as native Android UI Browser User Agent -# Removed CFNetwork as native iOS and OSX Browser User Agent - -# 2021-05--05 RobC - -# Removed Baidu catchall because its picking up baidu.sogo.uc.UCBrowser which is a phone browser -# Added baiduspider- catchall instead - -# Newly added from 2021-05-05 -# Adsbot -# BW/ -# Bytespider -# CheckMarkNetwork/ -# DuckDuckBot -# # Foregenix Web Scan -# IonCrawl -# Linguee Bot -# Neevabot -# PetalBot -# TkBot -# vuhuvBot - - -# 2018-03-13 RobC -# Added 36 robots and one generic ( survey ) using v 7.7 robots file as base. -# Also moved robot "Obot" into generics so that it is singled out as an individual Robot. -# -# 2016-09-02 RobC -# Fixed a few errors and added a few missing bots from awstats 7.5 release. -# -# 2016-08-28 RobC -# Complete re-build of this file almost from scratch. -# dropped many old bots, added many new bots and reordered file. -# edited and added regex expressions to stop spaces causing problems. -# You should tune file by placing the most common robots crawling your site at top -# in List1. -# -# -# N.B. many bots need to be in correct order so don't chnage order without checking if -# change will cause counts to be allocated to wrong bot. Not always simple. 
-# -# -# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html -# added dipsie (not tested with real data). -# added DomainsDB.net http://domainsdb.net/ -# added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) -# added Nutch (used by looksmart (furl?)) -# added rssImagesBot -# added Sqworm -# added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e -# added w3c css-validator -# added documentation link to bot home pages for above and selected major bots. -# In the case of international bots, choose .com page. -# Included tool tip (html "title"). -# To do: parameterize to match both AWStats language and tooltips settings. -# To do: add html links for all bots based on current documentation in source -# files referenced below. -# changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) -# made minor grammar corrections to notes below -# 2005-08-24 added YahooSeeker-Testing -# added w3c-checklink -# updated url for ask.com -# 2005-08-24 added Girafabot http://www.girafa.com/ -# 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ -# added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) -# dded geniebot (wgao@genieknows.com) -# added BecomeBot link http://www.become.com/site_owners.html -# added topicblogs http://www.topicblogs.com/ -# added Powermarks; seen used by referrer spam -# added YahooSeeker -# added NG/2. 
http://www.exabot.com/ -# 2005-09-15 added link for Walhello appie -# added bender focused_crawler -# updated YahooSeeker description (blog crawler) -# 2005-09-16 added link for http://linkchecker.sourceforge.net -# added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) -# added Blogslive info@blogslive.com intelliseek.com -# added BlogPulse (ISSpider-3.0) intelliseek.com -# 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) -# added EverbeeCrawler -# added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html -# added link for Bloglines http://www.bloglines.com -# 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) -# added Blogshares Spiders (Synchronized V1.5.1) -# added yacy -# 2005-11-21 added Argus www.simpy.com -# added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) -# added MJ12bot http://majestic12.co.uk/bot.php -# added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) -# added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) -# added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html -# added Seekbot (http://www.seekbot.net/bot.html) -# added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) -# added link for BaiDuSpider -# added link for Blogshares Spider -# added link for StackRambler http://www.rambler.ru/doc/faq.shtml -# added link for WISENutbot -# added link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com. Moved location to above wisenut to avoid classification as wisenut -# 2005-12-15 -# added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. 
-# added findlinks http://wortschatz.uni-leipzig.de/findlinks/ -# added IBM Almaden Research Center WebFountain™ http://www.almaden.ibm.com/cs/crawler [hc3] -# added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) -# added lmspider (lmspider@scansoft.com) http://www.nuance.com/ -# added noxtrumbot http://www.noxtrum.com/ -# added SandCrawler (Microsoft) -# added SBIder http://www.sitesell.com/sbider.html -# added SeznamBot http://fulltext.seznam.cz/ -# added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) -# added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net -# added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) -# added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ -# added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html -# added link for GigaBot -# added link for MagpieRSS -# added link for MSIECrawler -# 2005-12-21 -# added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] -# added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) -# added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] -# added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ -# added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. -# added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] -# added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ? 
-# 2005-12-22 -# added EARTHCOM.info www.earthcom.info -# added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] -# added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] -# 2006-01-01 -# added Dulance http://www.dulance.com/bot.jsp -# added MojeekBot http://www.mojeek.com/bot.html -# added nicebot http://www.egghelp.org/setup.htm ? -# added Snappy http://www.urltrends.com/faq.php -# added sohu agent -# added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] -# added zspider http://feedback.redkolibri.com/ -# 2006-01-13 -# added boitho.com-dc http://www.boitho.com/dcbot.html -# added IRLbot http://irl.cs.tamu.edu/crawler -# added virus_detector virus_harvester@securecomputing.com -# added Wavefire http://www.wavefire.com; info@wavefire.com - -# added WebFilter Robot -# 2006-01-24 -# added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp -# added Exabot exabot.com -# added LetsCrawl.com http://letscrawl.com -# added ichiro http://help.goo.ne.jp/door/crawlerE.html -# 2006-01-27 additional 22 robots from a list provided by Moizes Gabor -# added ALeadSoftbot http://www.aleadsoft.com/bot.htm -# added CipinetBot http://www.cipinet.com/bot.html -# added Cuasarbot http://www.cuasar.com/ -# added Dumbot http://www.dumbfind.com/ -# added Extreme_Picture_Finder http://www.exisoftware.com/ -# added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots -# added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it -# added InsurancoBot http://www.fastspywareremoval.com/ -# added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca -# added Kurzor http://www.easymail.hu/ cursor@easymail.hu -# added NutchCVS 
http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added Orbiter http://www.dailyorbit.com/bot.htm -# added PHP_version_tracker http://www.nexen.net/phpversion/bot.php -# added SuperBot http://www.sparkleware.com/superbot/ -# added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com -# added TestBot http://www.agbrain.com/ -# added TutorGigBot http://www.tutorgig.info/ -# added WebIndexer mailto://webindexerv1@yahoo.com -# added WebMiner http://64.124.122.252/feedback.html -# 2006-02-01 -# added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 -# added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 -# additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] -# added Candlelight_Favorites_Inspector -# added DomainChecker -# added EasyDL -# added FavOrg -# added Favorites_Sweeper -# added Html_Link_Validator -# added Internet_Ninja -# added JRTwine_Software_Check_Favorites_Utility -# fixed Microsoft_URL_Control -# added miniRank -# added Missigua_Locator -# added NPBot -# added Ocelli -# added Onet.pl_SA -# added proodleBot -# added SearchGuild_DMOZ_Experiment -# added Susie -# added Website_Monitoring_Bot -# added Xenu_Link_Sleuth -# 2006-05-15 -# added ASPseek http://www.aspseek.org/ -# added AdamM Bot http://home.blic.net/adamm/ -# added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html -# added arianna.libero.it (Italian Portal/search engine) -# added Biz360 spider http://www.biz360.com -# added BlogBridge Service http://www.blogbridge.com/ -# added BlogSearch http://www.icerocket.com/ -# added libcrawl -# added edgeio-relanshanbottriever http://www.edgeio.com -# added FeedFlow http://feedflow.com/about -# added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt -# added Java catchall - used by 
many spam bots -# added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb -# added msnbot-media http://search.msn.com/msnbot.htm -# added MT::Telegraph::Agent -# added Netluchs http://www.netluchs.de/ (German SE bot) -# added oBot http://www.webmasterworld.com/forum11/1616.htm -# added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. -# added ping.blo.gs http://blo.gs/ping.php blog bot -# added Sphere Scout http://www.sphere.com/ -# added sproose crawler http://www.sproose.com/bot.html -# added SyndicAPI http://syndicapi.com/bot.html -# added Yahoo! Mindset http://mindset.research.yahoo.com/ -# added msrabot -# added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents)#=uk -# fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) -# changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. -# This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. -# 2006-05-17 -# added Alpha Search Agent # 62.152.125.60 Eurologon Srl -# added Krugle http://www.krugle.com/crawler/info.html the search engine for developers -# added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine -# added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ -# added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html -# You must reprocess old logs for the Yahoo! 
Slurp China bot to be detected in old reports -# 2006-05-20 -# added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml -# added Accoona-AI-Agent http://www.accoona.com/ -# added ActiveBookmark http://www.libmaster.com/active_bookmark.php -# added BIGLOTRON http://www.biglotron.com/robot.html -# added Bookmark-Manager http://bkm.sourceforge.net/ -# added cbn00glebot -# added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 -# added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork -# added CheckWeb link validator http://p.duby.free.fr/chkweb.htm -# added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html -# added ConveraCrawler http://www.authoritativeweb.com/crawl/ -# added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ -# added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php -# added Cursor http://adcenter.hu/docs/en/bot.html -# added Custo http://www.netwu.com/custo/ -# added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ -# added Deepindex http://www.deepindex.net/faq.php -# added DNSGroup http://www.dnsgroup.com/ -# added DoCoMo http://www.nttdocomo.co.jp/ -# added dumm.de-Bot http://www.dumm.de/ -# added ETS v http://www.freetranslation.com/help/ -# added eventax http://www.eventax.de/ -# added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ -# added FAST Enterprise Crawler http://www.fast.no/ -# added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ -# added FeedValidator http://feedvalidator.org/ -# added FilmkameraBot http://www.filmkamera.at/bot.html -# added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece -# added Global Fetch http://www.wesonet.com/ -# added GOFORITBOT http://www.goforit.com/about/ -# added GoForIt.com http://www.goforit.com/about/ -# added GPU p2p crawler 
http://gpu.sourceforge.net/search_engine.php -# added HooWWWer http://cosco.hiit.fi/search/hoowwwer/ -# added HPPrint -# added HTMLParser http://htmlparser.sourceforge.net/ -# added Hundesuche.com-Bot http://www.hundesuche.com/ -# added InfoBot http://www.infobot.org/ -# added InfociousBot http://corp.infocious.com/tech_crawler.php -# added InternetSupervision http://internetsupervision.com/ -# added isearch2006 http://www.yahoo.com.cn/ -# added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ -# added KalamBot http://64.124.122.251/feedback.html -# added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ -# added Kevin http://dznet.com/kevin/ -# added KnowItAll http://www.cs.washington.edu/research/knowitall/ -# added Knowledge.com http://www.knowledge.com/ -# added Kouaa Krawler http://www.kouaa.com/ -# added ksibot http://ego.ms.mff.cuni.cz/ -# added Link Valet Online http://www.htmlhelp.com/tools/valet/ -# added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request -# added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm -# added MapoftheInternet.com http://MapoftheInternet.com/ -# added Matrix S.p.A. 
- FAST Enterprise Crawler http://tin.virgilio.it/ -# added Megite http://www.megite.com/ -# added Metaspinner http://index.meta-spinner.de/ -# added Mini-reptile -# added Misterbot http://www.misterbot.fr/ -# added Miva http://www.miva.com/ -# added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b -# added MSRBOT http://research.microsoft.com/research/sv/msrbot/ -# added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 -# added Mydoyouhike http://www.doyouhike.net/my -# added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b -# added NetSprint http://www.netsprint.pl/serwis/ -# added NimbleCrawler http://www.healthline.com/ -# added OpenWebSpider http://www.openwebspider.org/ -# added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html -# added OSSProxy http://www.marketscore.com/FAQ.Aspx -# added passwordmaker.org http://passwordmaker.org/ -# added PEAR HTTP Request class http://pear.php.net/ -# added PEERbot http://www.peerbot.com/ -# added PHP version tracker http://www.nexen.net/phpversion/bot.php -# added PictureOfInternet http://malfunction.org/poi/ -# added plinki http://www.plinki.com/ -# added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b -# added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b -# added ProjectWF-java-test-crawler -# added PyQuery http://sourceforge.net/projects/pyquery/ -# added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ -# added Scumbot -# added Sensis Web Crawler http://www.sensis.com.au/ -# added snap.com beta crawler http://www.snap.com/ -# added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ -# added STEROID Download 
http://faqs.org.ru/progr/pascal/delphi_internet2.htm -# added Suchfin-Bot http://www.suchfin.de/ -# added Sunrise http://www.sunrisexp.com/ -# added Tagyu Agent http://www.tagyu.com/ -# added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm -# added TeragramCrawlerSURF http://www.teragram.com/ -# added Test Crawler http://netp.ath.cx/ -# added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ -# added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html -# added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) -# added updated http://www.updated.com/ -# added Vermut http://vermut.aol.com -# added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html -# added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb -# added VSE http://www.vivisimo.com/ -# added webcrawl.net http://www.webcrawl.net/ -# added Web Downloader http://www.krasu.ru/soft/chuchelo/ -# added Webdup http://www.webdup.com/en/index.html -# added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b -# added WordPress http://wordpress.org/ -# added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ -# added Xenu's Link Sleuth (with ') -# added xirq http://www.xirq.com/ -# added yoogliFetchAgent http://www.yoogli.com/ -# added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ -# -- fix - some robots were reported with _ where _ should have been a space. -# changed Xenu Link Sleuth -# changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control -# changed favorites_sweeper -> favorites_sweeper -# -- updates -# updated AskJeeves to Ask -# 2012-06-05 Albrecht Mueller -# added Grabber from SDSC (San Diego Supercomputer Center). 
-# 2013-09-30 Albrecht Mueller -# AWStats probably cannot detect this bot as it identifies itself in -# the referrer field and not in the user agent string. -#92.113.100.35 - - [29/Sep/2013:17:22:46 +0200] "GET /robots.txt HTTP/1.1" 200 516 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" -#92.113.100.35 - - [29/Sep/2013:17:22:49 +0200] "GET /tghome.htm HTTP/1.1" 200 4445 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" -#92.113.100.35 - - [29/Sep/2013:17:22:51 +0200] "GET / HTTP/1.1" 200 5467 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" - -# to do MS Search 4.0 Robot - -#package AWSROB; - - -# Robots list was found at http://www.robotstxt.org/wc/active/all.txt -# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html -# Rem: To avoid bad detection, some robot's ids were removed from this list: -# - Robots with ID of 3 letters only -# - Robots called 'webs' and 'tcl' -# Rem: directhit changed into direct_hit (its real id) -# Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser -# Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser -# Rem: roadrunner changed into road_runner -# Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser -# Rem: voyager changed into ^voyager\/ to avoid to exclude voyager and amigavoyager browser - -# RobotsSearchIDOrder -# It contains all matching criteria to search for in log fields. This list is -# used to know in which order to search Robot IDs. -# Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more -# Minor robots are in list2, used when LevelForRobotsDetection is 2 or more -# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. 
-#------------------------------------------------------- - - -@RobotsSearchIDOrder_list1 = ( -# Common robots (In robot file) -'bingbot/', -'bingpreview', -'MSIECrawler', -'msnbot/', -'msnbot\-media/', -'AdIdxBot/', -'NOT[\x20]Googlebot/', -'Googlebot/', -'Google[\x20]Web[\x20]Preview', -'Googlebot\-Image/', -'Googlebot\-Mobile/', -'Google[\x20]Page[\x20]Speed', -'google\-sitemaps', -'Googlebot\-News', -'Googlebot\-Video/', -'AdsBot\-Google[\x20]\(', -'AdsBot\-Google\-Mobile\-Apps', -'Adsbot', -'Mediapartners-Google', -'Feedfetcher\-Google', -'Google\-Adwords\-Instant', -'Firefox/1\.5', -'Yahoo![\x20]Slurp[\x20]China', -'Yahoo![\x20]Slurp', -'Baiduspider/', -'Baiduspider\-image', -'Baiduspider-', -'YandexBot/', -'YandexImages/', -'YandexImageResizer', -'YandexMetrika/', -'YandexMobileBot/', -'yandex', -'electricmonk/', -'spbot/', -'SeznamBot/', -'msie8', -'AhrefsBot/', -'007ac9[\x20]Crawler', -'2345Explorer/', -'360Spider', -'A[\x20]Simple[\x20]Crawler', -'Abrave', -'acapbot/', -'Accoona\-AI\-Agent/', -'arcemedia', -'AdnormCrawlerCatchBot/', -'adscanner', -'aiHitBot/', -'aipbot/', -'AlphaBot', -'Apache\-HttpClient/', -'Apexoo[\x20]Spider', -'Applebot/', -'archive\.org_bot', -'Babya[\x20]Discoverer', -'Barkrowler', -'BDCbot/', -'BellPagesCA/', -'BeNosey[\x20]Mohawk[\x20]Search', -'bhcBot', -'bidswitchbot', -'BigBozz/', -'BinGet/', -'bitlybot', -'bl\.uk_lddc_bot/', -'BLEXBot/', -'bnf.fr_bot', -'boitho\.com\-dc/', -'BoogleBot', -'BusinessBot:', -'BW/', -'Bytespider', -'CatchBot/', -'CB/Nutch', -'CCBot/', -'CheckMarkNetwork/', -'Cliqzbot/', -'CMS[\x20]Crawler', -'Companybook\-Crawler', -'ConveraCrawler/', -'Contacts-Crawler', -'contxbot', -'cosmos/', -'crawl/Nutch', -'crawler4j', -'CRAZYWEBCRAWLER', -'CRMNLCrawlAgent', -'CSE[\x20]HTML[\x20]Validator', -'C\-T[\x20]bot', -'CUBOT', -'Curl/PHP', -'cyencebot', -'DataCrawler/', -'daumoa', -'daum', -'Deepnet[\x20]Explorer', -'DeuSu/', -'Digincore', -'Discordbot/', -'Dispatch/', -'DnyzBot', -'DoCoMo/', 
-'Domain[\x20]Re\-Animator[\x20]Bot', -'DomainCrawler/', -'DomainMacroCrawler/', -'DomainSONOCrawler/', -'DomainStatsBot/', -'DotBot/', -'DuckDuckBot-Https', -'DuckDuckBot', -'DuckDuckGo\-Favicons\-Bot/', -'ELinks/', -'ELinks[\x20]\(', -'EmailMarketingRobot/', -'EmeraldShield\.com[\x20]WebBot', -'envolk\[ITS\]spider/', -'eright', -'EsperanzaBot', -'Exabot/', -'ExtLinksBot', -'ExperianCrawlUK', -'facebookexternalhit/', -'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de', -'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', -'FAST\-WebCrawler/', -'Feosey[\x20]Mohk[\x20]Crawler', -'findlinks/', -'Findxbot/', -'FirePHP/', -'firstdirectory\-bot', -'flamingo', -'FlippyBearBot/', -'^foo$', -'Foregenix[\x20]Web[\x20]Scan', -'FreeWebMonitoring[\x20]SiteChecker/', -'fujilabol', -'FurlBot/', -'Gaisbot/', -'Gallent[\x20]Spider', -'GarlikCrawler/', -'Getintent[\x20]Crawler', -'GetintentCrawler[\x20]getintent\.com', -'Gigabot/', -'gipo\-crawler/Nutch', -'Girafabot', -'Gluten[\x20]Free[\x20]Crawler/', -'gocrawl', -'Gowikibot', -'Go\-http\-client/', -'GrapeshotCrawler/', -'GSiteCrawler/', -'GurujiBot/', -'hadiBot', -'HaosouSpider', -'HELLO[\x20]Crawler', -'holmes/', -'houzzbot', -'HTTP_Request2/', -'HubSpot[\x20]Webcrawler', -'HyperCrawl/', -'ICC\-Crawler/', -'iconoclast', -'IDGCrawler/Nutch', -'IDG/UK', -'idmarch[\x20]Automatic\.beta/', -'InbyBot', -'Incutio[\x20]XML', -'IndeedBot', -'InfluenceBot', -'IonCrawl', -'IRLbot/', -'IssueCrawler', -'istellabot/', -'James[\x20]BOT', -'Jigsaw/', -'JobFeed', -'Jooblebot', -'KomodiaBot/', -'Konqueror/', -'laserlikebot', -'Lightspeed', -'linkapediabot', -'metager\-linkchecker', -'Linguee[\x20]Bot', -'linkchecker', -'LinkCheck', -'linkdexbot/', -'LinkedInBot/', -'LinkpadBot/', -'Links[\x20]\(', -'LinksManager\.com_bot', -'LWP::Simple/', -'Mail\.RU_Bot/', -'makecontact', -'mappy', -'MauiBot', -'meanpathbot/', -'Mechanize', -'Mediatoolkitbot', -'MegaIndex\.ru/', -'merzscope', -'Meta_Bot', -'mfibot/', 
-'microsoft.*discovery', -'missigua_locator', -'MixrankBot', -'MJ12bot/', -'MojeekBot', -'Mojolicious', -'MXT/Nutch', -'My[\x20]Nutch[\x20]Spider/', -'myse/Nutch', -'Naaraa', -'Neevabot', -'NerdyBot', -'netEstate[\x20]NE[\x20]Crawler', -'NetResearchServer/', -'Nimbostratus-Bot', -'nominet', -'NRLCorpusBuilder/Nutch', -'nutch\-1\.4/', -'nutch\-1\.8/', -'NutchCVS/', -'o\.uk[\x20]robot', -'ocrawler;', -'ODP[\x20]link[\x20]checker', -'Offline[\x20]Explorer/', -'OmniExplorer_Bot/', -'OrangeBot/', -'Orliac', -'OutclicksBot', -'PageBitesHyperBot/', -'Pcore', -'pdffillerbot/', -'peopleman', -'PetalBot', -'PhantomJS', -'PHP/5\.2\.8', -'Pinterestbot', -'PiplBot', -'Ploetz[\x20]\+[\x20]Zeller', -'Plukkie/', -'Princetonbot/', -'PrivacyAwareBot/', -'Prlog/', -'proximic', -'psbot/', -'psbot\-image', -'python_wk_crawler', -'Python\-urllib/', -'QCrawl', -'Quick-Crawler', -'ResearchBot', -'roboto', -'rogerbot/', -'RSSingBot', -'RukiCrawler/', -'SafeDNS[\x20]search[\x20]bot/', -'SafeDNSBot', -'SafeSearch[\x20]microdata[\x20]crawler', -'safesearch', -'SBL\-BOT', -'scrapy', -'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/', -'ScreenerBot[\x20]Crawler[\x20]Beta', -'Scrubby', -'Searchie/', -'SecurityResearch\.bot', -'Seekmo', -'semanticbot', -'SemrushBot/', -'SemrushBot-SI', -'seo\-audit\-check\-bot/', -'Seobility', -'SEOkicks\-Robot', -'SEOlyticsCrawler/', -'SEOstats', -'Seosys/Nutch', -'Seoterritory\.com[\x20]bot', -'serendeputy', -'Shim\-Crawler', -'SiteExplorer/', -'siteexplorer\.info', -'siteimprove', -'Slackbot\-LinkExpanding', -'SmabblerBot/', -'Sogou[\x20]web[\x20]spider/', -'special_archiver/', -'Spiderbot/', -'SpuhexBot', -'spyonweb', -'ssearch_bot', -'Streamline3Bot', -'SurdotlyBot/', -'SurveyBot/', -'taiil/Nutch', -'tbot\-nutch', -'TeeRaidBot', -'TelegramBot', -'Test/Nutch', -'Test[\x20]Spider', -'TestCrawler', -'The[\x20]Knowledge[\x20]AI', -'TkBot', -'tracemyfile', -'trendiction', -'TurnitinBot/', -'TurnitinBot', -'TweetmemeBot/', -'UCY/Nutch', -'uni-leipzig\.de', 
-'Uptimebot/', -'UptimeRobot/', -'URL[\x20]Checker', -'UXCrawlerBot', -'Validator\.nu/', -'vBSEO', -'vBulletin[\x20]via[\x20]PHP', -'vebidoobot', -'vegi[\x20]bot', -'Velen', -'viz/Nutch', -'VoilaBot', -'VORTEX/', -'voyager/', -'vuhuvBot', -'W3C_Validator/', -'W3C\-checklink/', -'WBSearchBot/', -'WbSrch/', -'WeSEE:Ads/PageBot', -'WeSEE:Ads/PictureBot', -'WeSEE_Bot', -'Wget/', -'Who\.is[\x20]Bot', -'wonderbot/', -'woobot/', -'Wotbox/', -'Xaldon[\x20]WebSpider', -'Xenu[\x20]Link[\x20]Sleuth', -'xenu_link_sleuth', -'XML[\x20]Sitemaps[\x20]Generator', -'XoviBot/', -'yacybot', -'Yahoo[\x20]Link[\x20]Preview', -'yak', -'YisouSpider', -'yoozBot', -'Your\-Website\-Sucks', -'zoominfobot', -'zspider/', -'ZumBot/', -# below placed at end to catch some generics -'ng/1\.', -'ng/2\.', -'libwww\-perl', -'urllib', -'javabee', -'projectwf\-java\-test\-crawler', -'java', -'loocalcrawler/nutch', -'nutchosu\-vlib', -'nutch', -'perlcrawler', -'perl', -# old robots using firefox < version 11 not identifying themselves as a robot. 
-'(firefox/)([0-9]\.|[0-1][0]\.)' -); - -@RobotsSearchIDOrder_list2 = ( -# Less common robots (In robot file) -'^Mozilla$', -'^mozilla\/3\.0\s\(compatible$', -'^mozilla\/4\.0$', -'^mozilla\/4\.0\s\(compatible;\)$', -'^mozilla\/5\.0$', -'^mozilla\/5\.0\s\(compatible;$', -'^mozilla\/5\.0\s\(en\-us\)$', -'^mozilla\/5\.0\sfirefox\/3\.0\.5$', -'^Mozilla/6\.0[\x20]\(compatible\)$', -'^Mozilla/(.*)Beta[\x20]\(Windows\)', -'MSIE[\x20]2', -'MSIE[\x20]3', -'MSIE[\x20]4', -'MSIE[\x20]5', -'MSIE[\x20]6', -'MSIE\+6\.0\;', -'Windows[\x20]95', -'Windows[\x20]98', - -# these could be removed to speed up processing as they are rarely seen -'a6\-indexer', -'abcdatos', -'abonti\.com', -'acme\.spider', -'activebookmark', -'adamm_bot', -'advbot', -'affectv\.co\.uk', -'ahoythehomepagefinder', -'aleadsoftbot', -'alkaline', -'allrati', -'alltop', -'almaden', -'alpha_search_agent', -'anthill', -'antibot', -'aport', -'appie', -'applesyndication', -'arachnophilia', -'arale', -'araneo', -'architext', -'archive\-de\.com', -'aretha', -'argus', -'ariadne', -'arianna\.libero\.it', -'arks', -'aspider', -'aspseek', -'asterias', -'asynchttpclient', -'atn\.txt', -'atomz', -'auresys', -'awbot', -'backlinktest\.com', -'backrub', -'bbot', -'becomebot', -'bender', -'betabot', -'bigbrother', -'biglotron', -'BingLocalSearch', -'bittorrent_bot', -'biz360[_+\s]spider', -'bjaaland', -'blackwidow', -'blindekuh', -'blogbridge[_+\s]service', -'blogged_crawl', -'bloglines', -'bloglovin', -'blogpulse', -'blogsearch', -'blogshares', -'blogslive', -'blogssay', -'bloodhound', -'bncf\.firenze\.sbn\.it/raccolta\.txt', -'bobby', -'bookmark\-manager', -'borg\-bot', -'boris', -'brightnet', -'bruinbot', -'bspider', -'bubing', -'bumblebee', -'butterfly', -'buzztracker', -'cactvschemistryspider', -'calif[^r]', -'candlelight[_+\s]favorites[_+\s]inspector', -'careerbot', -'carpathia', -'cassandra', -'catbot', -'cbn00glebot', -'cerberian_drtrs', -'cfetch', -'cgireader', -'chattertrap', -'check_http', -'checkbot', 
-'checkweb_link_validator', -'christcrawler', -'churl', -'cienciaficcion', -'cipinetbot', -'imagecoccoc', -'coccoc', -'coldfusion', -'collective', -'combine', -'commons\-httpclient', -'computer_and_automation_research_institute_crawler', -'conceptbot', -'contentmatch', -'converamultimediacrawler', -'coolbot', -'copubbot', -'core', -'covario', -'cruiser', -'cscrawler', -'cuasarbot', -'cursor', -'cusco', -'custo', -'cyberspyder', -'datafountains/dmoz_downloader', -'dataprovider\.com', -'daviesbot', -'daylifefeedfetcher', -'daypopbot', -'deepindex', -'desertrealm', -'deweb', -'dienstspider', -'digger', -'digout4u', -'diibot', -'dipsie\.bot', -'direct_hit', -'discobot', -'dlvr\.it', -'dnabot', -'dnsgroup', -'doccheckbot', -'domainappender', -'domainchecker', -'domainsdb\.net', -'download_express', -'dragonbot', -'dreamwidth', -'drupal', -'dulance', -'dumbot', -'dumm\.de\-bot', -'dwcp', -'e\-collector', -'earthcom\.info', -'easydl', -'ebiness', -'eccp', -'echo!', -'edgeio\-retriever', -'elfinbot', -'emacs', -'emcspider', -'enteprise', -'ernst[:blank:]2\.0', -'esther', -'ets_v', -'eventax', -'everbeecrawler', -'everest\-vulcan', -'evliyacelebi', -'exactseek', -'extreme[_+\s]picture[_+\s]finder', -'ezoom', -'ezresult', -'facebook', -'facebot', -'fast\-search\-engine', -'matrix_s\.p\.a\._\-_fast_enterprise_crawler', -'fast_enterprise_crawler', -'fastbot', -'fastcrawler', -'favicon', -'favorg', -'favorites_sweeper', -'fdse', -'feedburner', -'feedcrawl', -'feedflow', -'feedmyinbox', -'feedroll\.com', -'feedsky', -'feedster', -'feedvalidator', -'feedzira', -'felix', -'ferret', -'fetchbot', -'fetchrover', -'fever/', -'fido', -'filmkamerabot', -'filterdb\.iss\.net', -'finderlein[_+\s]research[_+\s]crawler', -'findexa_crawler', -'finnish', -'fireball', -'firmilybot', -'flexum', -'foaf\-search\.net', -'fooky\.com/ScorpionBot', -'fouineur', -'francoroute', -'freecrawl', -'freenews', -'funnelweb', -'g2crawler', -'gama', -'gazz', -'gcreep', -'geniebot', -'genieo', -'geohasher', 
-'getbot', -'geturl', -'gigablastopensource', -'global_fetch', -'gnodspider', -'goforit\.com', -'goforitbot', -'golem', -'gonzo', -'gougou', -'gpu_p2p_crawler', -'grabber', -'grapeshot', -'grapnel', -'griffon', -'gromit', -'grub', -'gulliver', -'gulperbot', -'hambot', -'hanrss', -'harvest', -'havindex', -'henrythemiragorobot', -'heritrix', -'hl_ftien_spider', -'hometown', -'hoowwwer', -'hpprint', -'htdig', -'html[_+\s]link[_+\s]validator', -'htmlgobble', -'htmlparser', -'httrack', -'hundesuche\.com\-bot', -'hyperdecontextualizer', -'ia_archiver\-web\.archive\.org', -'ia_archiver', -'iajabot', -'iaskspider', -'i\-bot', -'icarus6j', -'ichiro', -'icjobs\.de', -'ilse', -'iltrovatore\-setaccio', -'imagelock', -'implisensebot', -'inagist', -'incywincy', -'infobot', -'infociousbot', -'infohelfer', -'infomine', -'informant', -'infoseeksidewinder', -'infoseek', -'infospider', -'inspectorwww', -'insurancobot', -'integromedb\.org', -'intelliagent', -'internet[_+\s]ninja', -'internetarchive', -'internetseer', -'internetsupervision', -'ips\-agent', -'irobot', -'iron33', -'isearch2006', -'israelisearch', -'iupui_research_bot', -'izsearch', -'jacobin[\x20]club', -'jakarta', -'jbot', -'jcrawler', -'jeeves', -'jennybot', -'jobboerse', -'jobot', -'jobo', -'joebot', -'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', -'js\-kit', -'jubii', -'jumpstation', -'justview', -'kalambot', -'kamano\.de_newsfeedverzeichnis', -'kapsi', -'katipo', -'kazoombot', -'kevin', -'keyoshid', -'kilroy', -'kinja\-imagebot', -'kinjabot', -'knowitall', -'knowledge\.com', -'ko[_+\s]yappo[_+\s]robot', -'kouaa_krawler', -'krugle', -'ksibot', -'kummhttp', -'kurzor', -'labelgrabber\.txt', -'lanshanbot', -'larbin', -'largesmall[\x20]crawler', -'legs', -'letscrawl\.com', -'libcrawl', -'lilina', -'link_valet_online', -'linkbot', -'linkdex\.com', -'linkidator', -'linkscan', -'linkstats[\x20]bot', -'linkwalker', -'lipperhey', -'livejournal\.com', -'lmspider', -'loadtimebot', -'lockon', -'logo_gif', 
-'longurl', -'lssrocketcrawler', -'ltbot', -'ltx71', -'lwp\-request', -'lwp\-trivial', -'lycos[_+\s]', -'macworm', -'madaali\.de', -'magpierss', -'magpie', -'mapoftheinternet\.com', -'marvin', -'mattie', -'mediabot', -'mediafox', -'megaindex', -'megite', -'memorybot', -'mercator', -'meshexplorer', -'metager2\-verification\-bot', -'metajobbot', -'metaspinner', -'metauri', -'miadev', -'microsoft[_+\s]url[_+\s]control', -'microsoft[\x20]bits', -'microsoft\-webdav\-miniredir', -'mindcrawler', -'mindupbot', -'mini\-reptile', -'minirank', -'misterbot', -'miva', -'mizzu_labs', -'mnogosearch', -'moget', -'momspider', -'monster', -'motor', -'movabletype', -'ms[_+\s]search[_+\s]6\.0[_+\s]robot', -'ms_search_4\.0_robot', -'msnbot\-udiscovery', -'msrabot', -'msrbot', -'mt::telegraph::agent', -'muncher', -'muscatferret', -'mwdsearch', -'mydoyouhike', -'myweb', -'nagios', -'nasa_search', -'ndspider', -'nederland\.zoek', -'netcarta', -'netcraft', -'netluchs', -'netmechanic', -'netnewswire', -'netscoop', -'netsprint', -'netvibes', -'newrelicpinger', -'newscan\-online', -'newsfox', -'newsgatoronline', -'nextgensearchbot', -'nhse', -'nicebot', -'nimblecrawler', -'ning', -'nomad', -'northstar', -'noxtrumbot', -'npbot', -'nzexplorer', -'objectssearch', -'occam', -'ocelli', -'octopus', -'octora_beta_bot', -'onet\.pl[_+\s]sa', -'onfolio', -'openfind', -'opentaggerbot', -'openwebspider', -'optimizer', -'oracle_ultra_search', -'orb_search', -'orbiter', -'packrat', -'pageboy', -'panscient', -'parasite', -'passwordmaker\.org', -'patric', -'pear_http_request_class', -'peerbot', -'pegasus', -'perignator', -'perman', -'petersnews', -'phantom', -'php[_+\s]version[_+\s]tracker', -'phpcrawl', -'phpdig', -'picmole', -'pictureofinternet', -'piltdownman', -'pimptrain', -'ping\.blo\.gs', -'pingdom', -'pioneer', -'pita', -'pitkow', -'pjspider', -'plinki', -'pluckfeedcrawler', -'plumtreewebaccessor', -'pogodak', -'pompos', -'popdexter', -'poppi', -'port_huron_labs', -'portalb', -'postfavorites', 
-'postpost', -'postrank', -'powermarks', -'printfulbot', -'proodlebot', -'protopage', -'publiclibraryarchive', -'pyquery', -'python', -'qihoobot', -'quipply', -'qwantify', -'r6\_', -'rambler', -'ratingburner', -'raven', -'rbse', -'redalert', -'regator', -'relevantnoise\.com', -'resumerobot', -'rhcs', -'riddler', -'road_runner', -'robbie', -'robi', -'robocrawl', -'robofox', -'robozilla', -'rojo', -'rome[\x20]client', -'roverbot', -'rpt\-httpclient', -'rssgraffiti', -'rssimagesbot', -'ruffle', -'rufusbot', -'rules', -'safeads\.xyz', -'safetynetrobot', -'sage\+\+', -'sandcrawler', -'savetheworldheritage', -'sbider', -'schizozilla', -'scooter', -'scoutjet', -'scumbot', -'search\-info', -'search_au', -'searchguild[_+\s]dmoz[_+\s]experiment', -'searchmetricsbot', -'searchprocess', -'seekbot', -'semalt', -'senrigan', -'sensis_web_crawler', -'seodiver', -'seokicks\.de', -'seoscanners', -'sgscout', -'shaggy', -'shaihulud', -'shareaholicbot', -'shoutcast', -'sift', -'simbot', -'simplepie', -'sistrix', -'site\-valet', -'sitebot', -'sitedomain\-bot', -'sitetech', -'skimbot', -'skymob', -'slcrawler', -'slurp', -'slysearch', -'smartspider', -'smtbot', -'snap\.com_beta_crawler', -'snappy', -'snooper', -'sohu\-search', -'sohu', -'solbot', -'speedy', -'sphere_scout', -'spider[_+\s]monkey', -'spiderline', -'spiderlytics', -'spiderman', -'spiderview', -'spip', -'sproose_crawler', -'spry', -'sqworm', -'ssearcher', -'steeler', -'steroid__download', -'stq_bot', -'Stratagems[\x20]Kumo', -'suchfin\-bot', -'suke', -'summify\.com', -'sunrise', -'suntek', -'superbot', -'superfeedr', -'susie', -'sven', -'syndic8', -'syndicapi', -'synoobot', -'synthesio', -'t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', -'tach_bw', -'tagyu_agent', -'tailrank', -'tarantula', -'tarspider', -'tcl_http_client_package', -'techbot', -'technoratibot', -'templeton', -'teoma', -'teragramcrawlersurf', -'test_crawler', -'testbot', -'thumbsniper', -'titan', -'titin', -'tkwww', -'tlspider', -'topblogsinfo', -'topicblogs', 
-'topix\.net', -'trapit', -'trileet', -'turtlescanner', -'turtle', -'tutorgigbot', -'tweetedtimes', -'twiceler', -'twisted[\x20]pagegetter', -'twitterbot', -'twitterfeed', -'ubicrawler', -'ucsd', -'udmsearch', -'ultraseek', -'um\-IC', -'um\-LN', -'unchaos_bot_hybrid_web_search_engine', -'unido\-bot', -'unisterbot', -'universalfeedparser', -'unlost_web_crawler', -'unwindfetchor', -'updated', -'urlck', -'ustc\-semantic\-group', -'vagabondo\-wap', -'vagabondo', -'valkyrie', -'vermut', -'versus_crawler_from_eda\.baykan@epfl\.ch', -'verticrawl', -'vespa_crawler', -'victoria', -'virus[_+\s]detector', -'visionsearch', -'voidbot', -'voltron', -'vse/', -'vwbot', -'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', -'w3index', -'w3m2', -'wallpaper', -'wanderer', -'wapspider', -'wapspIRLider', -'watchmouse', -'wavefire', -'waybackarchive\.org', -'wazzup', -'web_downloader', -'webbandit', -'webbase', -'webcatcher', -'webclipping\.com', -'webcollage', -'webcompass', -'webcopy', -'webcrawl\.net', -'webdup', -'webfetcher', -'webfilter', -'webfoot', -'webinator', -'webindexer', -'weblayers', -'weblinker', -'webminer', -'webmirror', -'webmoose', -'webquest', -'webreader', -'webreaper', -'website[_+\s]monitoring[_+\s]bot', -'websnarf', -'webspider', -'webvac', -'webvulncrawl', -'webwalker', -'webwalk', -'webwatch', -'wells_search', -'wer\-liefert\-was', -'wesee:search', -'wevikabot', -'whatuseek', -'whowhere', -'windows\-rss\-platform', -'wired\-digital', -'zyborg', -'wisenutbot', -'wiumi', -'wmir', -'wolp', -'wombat', -'wonderer', -'woozweb', -'wordpress', -'worm', -'wume_crawler', -'wwwc', -'wwweasel', -'wz101', -'xget', -'xirq', -'xydo', -'y!j', -'yahoo![\x20]searchmonkey', -'yahoo!_mindset', -'yahoo\-blogs', -'yahoo\-mmcrawler', -'yahoo\-newscrawler', -'yahoo[\x20]pipes', -'yahoo\-verticalcrawler', -'yahoocachesystem', -'yahooexternalcache', -'yahoofeedseeker', -'yahooseeker\-testing', -'yahooseeker', -'yahooysmcm', -'yammer', -'yanga', -'yet\-another\-spider', -'yeti', -'yie8', 
-'yodaobot', -'yooglifetchagent', -'youdao', -'yourls', -'z\-add_link_checker', -'zealbot', -'zemanta', -'zend_http_client', -'zeus', -'zhuaxia', -'[^a]fish', -'[\x20]netseer[\x20]', -'^[1-3]$', -'^finbot', -'^motorola$', -'^msie', -'^voyager/', -'^webindex$', -'1\-more_scanner', -'nbot' -); - -@RobotsSearchIDOrder_listgen = ( -# Generic robot -'robot', -'blog', -'checker', -'crawl', -'discover', -'feed', -'fetcher', -'hunter', -'link', -'scanner', -'seek', -'sitemap', -'spider', -'sucker', -'survey', -'validator', -'bot[\s_+:,\.\;\/\\\-]', -'[\s_+:,\.\;\/\\\-]bot', -'curl', -'php', -'ruby/', -# Moving oBot here so it doesn't get assigned for other *obot robots -'oBot/', -'no_user_agent' -); - - -# RobotsHashIDLib -# List of robots names ('robot id','robot clear text') -#------------------------------------------------------- -%RobotsHashIDLib = ( -# Common robots (In robot file) -'bingbot/','bingbot', -'bingpreview','BingPreview', -'MSIECrawler','MSIECrawler', -'msnbot/','msnbot', -'msnbot\-media/','msnbot-media', -'AdIdxBot/','AdIdxBot Microsoft Ad Quality control', -'NOT[\x20]Googlebot/','NOT Googlebot', -'Googlebot/','Googlebot', -'Google[\x20]Web[\x20]Preview','Google Web Preview', -'Googlebot\-Image/','Googlebot-Image', -'Googlebot\-Mobile/','Googlebot-Mobile', -'Google[\x20]Page[\x20]Speed','Google Page Speed', -'google\-sitemaps','google-sitemaps', -'Googlebot\-News','Googlebot-News', -'Googlebot\-Video/','Googlebot-Video', -'AdsBot\-Google[\x20]\(','AdsBot-Google', -'AdsBot\-Google\-Mobile\-Apps','AdsBot-Google-Mobile-Apps', -'Adsbot','Adsbot', -'Mediapartners-Google','Mediapartners-Google', -'Feedfetcher\-Google','Feedfetcher-Google', -'Google\-Adwords\-Instant','Google-Adwords-Instant', -'Firefox/1\.5','Nautic Expo using Firefox/1.5', -'Yahoo![\x20]Slurp[\x20]China','Yahoo! Slurp China', -'Yahoo![\x20]Slurp','Yahoo! 
Slurp', -'Baiduspider/','Baiduspider', -'Baiduspider\-image','Baiduspider-image', -'Baiduspider-','Baiduspider ( catchall )', -'YandexBot/','YandexBot', -'YandexImages/','YandexImages', -'YandexImageResizer','YandexImageResizer', -'YandexMetrika/','YandexMetrika', -'YandexMobileBot/','YandexMobileBot', -'yandex','Yandex ( catchall )', -'electricmonk/','electricmonk', -'spbot/','spbot', -'SeznamBot/','SeznamBot', -'msie8','msie8 - ( Rogue Robot )', -'AhrefsBot/','AhrefsBot', -'007ac9[\x20]Crawler','007ac9 Crawler', -'2345Explorer/','2345Explorer', -'360Spider','360Spider', -'A[\x20]Simple[\x20]Crawler','A Simple Crawler', -'Abrave','Abrave', -'acapbot/','acapbot', -'Accoona\-AI\-Agent/','Accoona-AI-Agent', -'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot', -'adscanner','adscanner', -'aiHitBot/','aiHitBot', -'aipbot/','aipbot', -'AlphaBot','AlphaBot', -'Apache\-HttpClient/','Apache-HttpClient', -'Apexoo[\x20]Spider','Apexoo Spider', -'Applebot/','Applebot', -'arcemedia','AdsBot-ArceMedia', -'archive\.org_bot','archive.org_bot', -'Babya[\x20]Discoverer','Babya Discoverer', -'Barkrowler','Barkrowler', -'BDCbot/','BDCbot', -'BellPagesCA/','BellPagesCA', -'BeNosey[\x20]Mohawk[\x20]Search','BeNosey Mohawk Search', -'bhcBot','bhcBot', -'bidswitchbot','bidswitchbot', -'BigBozz/','BigBozz', -'BinGet/','BinGet', -'bitlybot','bit.ly', -'bl\.uk_lddc_bot/','bl.uk_lddc_bot', -'BLEXBot/','BLEXBot', -'bnf.fr_bot','bnf.fr_bot', -'boitho\.com\-dc/','boitho.com-dc', -'BoogleBot','BoogleBot', -'BusinessBot:','BusinessBot:', -'BW/','BW', -'Bytespider','Bytespider', -'CatchBot/','CatchBot', -'CB/Nutch','CB/Nutch', -'CCBot/','CCBot', -'CheckMarkNetwork/','CheckMarkNetwork', -'Cliqzbot/','Cliqzbot', -'CMS[\x20]Crawler','CMS Crawler', -'Companybook\-Crawler','Companybook-Crawler', -'ConveraCrawler/','ConveraCrawler', -'Contacts-Crawler','Contacts-Crawler', -'contxbot','contxbot', -'cosmos/','cosmos', -'CRMNLCrawlAgent','CRMNLCrawlAgent', -'crawl/Nutch','crawl/Nutch', 
-'crawler4j','crawler4j', -'CRAZYWEBCRAWLER','CRAZYWEBCRAWLER', -'CSE[\x20]HTML[\x20]Validator','CSE HTML Validator', -'C\-T[\x20]bot','C-T bot', -'CUBOT','CUBOT', -'Curl/PHP','Curl/PHP', -'cyencebot','cyencebot', -'DataCrawler/','DataCrawler', -'daumoa','daumoa', -'daum','daum', -'Deepnet[\x20]Explorer','Deepnet Explorer', -'DeuSu/','DeuSu', -'Digincore','Digincore', -'Discordbot/','Discordbot', -'Dispatch/','Dispatch', -'DnyzBot','DnyzBot', -'DoCoMo/','DoCoMo', -'Domain[\x20]Re\-Animator[\x20]Bot','Domain Re-Animator Bot', -'DomainCrawler/','DomainCrawler', -'DomainMacroCrawler/','DomainMacroCrawler', -'DomainSONOCrawler/','DomainSONOCrawler', -'DomainStatsBot/','DomainStatsBot', -'DotBot/','DotBot', -'DuckDuckBot-Https','DuckDuckBot-Https', -'DuckDuckBot','DuckDuckBot', -'DuckDuckGo\-Favicons\-Bot/','DuckDuckGo-Favicons-Bot', -'ELinks/','ELinks', -'ELinks[\x20]\(','ELinks (', -'EmailMarketingRobot/','EmailMarketingRobot', -'EmeraldShield\.com[\x20]WebBot','EmeraldShield.com WebBot', -'envolk\[ITS\]spider/','envolk ITS spider', -'eright','eright', -'EsperanzaBot','EsperanzaBot', -'Exabot/','Exabot', -'ExtLinksBot','ExtLinksBot', -'ExperianCrawlUK','ExperianCrawlUK', -'facebookexternalhit/','facebookexternalhit', -'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise crawleradmin.t-info@telekom.de', -'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise T-Info_BI_cluster crawleradmin.t-info@telekom.de', -'FAST\-WebCrawler/','FAST-WebCrawler', -'Feosey[\x20]Mohk[\x20]Crawler','Feosey Mohk Crawler', -'findlinks/','findlinks', -'Findxbot/','Findxbot', -'FirePHP/','FirePHP', -'firstdirectory\-bot','firstdirectory-bot', -'flamingo','Flamingo_SearchEngine', -'FlippyBearBot/','FlippyBearBot', -'^foo$','foo', -'Foregenix[\x20]Web[\x20]Scan','Foregenix Web Scan', -'FreeWebMonitoring[\x20]SiteChecker/','FreeWebMonitoring SiteChecker', -'fujilabol','fujilabol', -'FurlBot/','FurlBot', -'Gaisbot/','Gaisbot', 
-'Gallent[\x20]Spider','Gallent Spider', -'GarlikCrawler/','GarlikCrawler', -'Getintent[\x20]Crawler','GetIntent Crawler', -'GetintentCrawler[\x20]getintent\.com','GetintentCrawler getintent.com', -'Gigabot/','Gigabot', -'gipo\-crawler/Nutch','gipo-crawler/Nutch', -'Girafabot','Girafabot', -'Gluten[\x20]Free[\x20]Crawler/','Gluten Free Crawler', -'gocrawl','gocrawl', -'Gowikibot','Gowikibot', -'Go\-http\-client/','Go-http-client', -'GrapeshotCrawler/','GrapeshotCrawler', -'GSiteCrawler/','GSiteCrawler', -'GurujiBot/','GurujiBot', -'hadiBot','hadiBot', -'HaosouSpider','HaosouSpider', -'HELLO[\x20]Crawler','HELLO Crawler', -'holmes/','holmes', -'houzzbot','houzzbot', -'HTTP_Request2/','HTTP_Request2', -'HubSpot[\x20]Webcrawler','HubSpot Webcrawler', -'HyperCrawl/','HyperCrawl', -'ICC\-Crawler/','ICC-Crawler', -'iconoclast','iconoclast', -'IDGCrawler/Nutch','IDGCrawler/Nutch', -'IDG/UK','IDG/UK', -'idmarch[\x20]Automatic\.beta/','idmarch Automatic.beta', -'InbyBot','InbyBot', -'Incutio[\x20]XML','Incutio XML', -'IndeedBot','IndeedBot', -'InfluenceBot','InfluenceBot', -'IonCrawl','IonCrawl', -'IRLbot/','IRLbot', -'IssueCrawler','IssueCrawler', -'istellabot/','istellabot', -'James[\x20]BOT','James BOT', -'Jigsaw/','Jigsaw', -'JobFeed','JobFeed', -'Jooblebot','Jooblebot', -'KomodiaBot/','KomodiaBot', -'Konqueror/','Konqueror', -'laserlikebot','laserlikebot', -'Lightspeed','Lightspeed', -'linkapediabot','linkapediabot', -'metager\-linkchecker','metager-linkchecker', -'Linguee[\x20]Bot','Linguee Bot', -'linkchecker','linkchecker', -'LinkCheck','LinkCheck', -'linkdexbot/','linkdexbot', -'LinkedInBot/','LinkedInBot', -'LinkpadBot/','LinkpadBot', -'Links[\x20]\(','Links (', -'LinksManager\.com_bot','LinksManager.com_bot', -'LWP::Simple/','LWP::Simple', -'Mail\.RU_Bot/','Mail.RU Bot', -'makecontact','makecontact', -'mappy','Mappy Crawler', -'MauiBot','MauiBot', -'meanpathbot/','meanpathbot', -'Mechanize','Mechanize', -'Mediatoolkitbot','Mediatoolkitbot', 
-'MegaIndex\.ru/','MegaIndex.ru', -'merzscope','merzscope', -'Meta_Bot','Meta_Bot', -'mfibot/','mfibot', -'microsoft.*discovery','Microsoft Office Protocol Discovery', -'missigua_locator','missigua_locator', -'MixrankBot','MixrankBot', -'MJ12bot/','MJ12bot', -'MojeekBot','MojeekBot', -'Mojolicious','Mojolicious', -'MXT/Nutch','MXT/Nutch', -'My[\x20]Nutch[\x20]Spider/','My Nutch Spider', -'myse/Nutch','myse/Nutch', -'Naaraa','Naaraa', -'Neevabot','Neevabot', -'NerdyBot','NerdyBot', -'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler', -'NetResearchServer/','NetResearchServer', -'Nimbostratus-Bot','Nimbostratus-Bot', -'nominet','nominet', -'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch', -'nutch\-1\.4/','nutch-1.4', -'nutch\-1\.8/','nutch-1.8', -'NutchCVS/','NutchCVS', -'o\.uk[\x20]robot','o uk.robot', -'ocrawler;','ocrawler;', -'ODP[\x20]link[\x20]checker','ODP link checker', -'Offline[\x20]Explorer/','Offline Explorer', -'OmniExplorer_Bot/','OmniExplorer_Bot', -'OrangeBot/','OrangeBot', -'Orliac','Orliac', -'OutclicksBot','OutclicksBot', -'PageBitesHyperBot/','PageBitesHyperBot', -'Pcore','Pcore', -'pdffillerbot/','pdffillerbot', -'peopleman','peopleman', -'PetalBot','PetalBot', -'PhantomJS','PhantomJS', -'PHP/5\.2\.8','PHP/5.2.8', -'Pinterestbot','Pinterestbot', -'PiplBot','PiplBot', -'Ploetz[\x20]\+[\x20]Zeller','Ploetz + Zeller', -'Plukkie/','Plukkie', -'Princetonbot/','Princetonbot', -'PrivacyAwareBot/','PrivacyAwareBot', -'Prlog/','Prlog', -'proximic','proximic', -'psbot/','psbot', -'psbot\-image','psbot-image', -'python_wk_crawler','python_wk_crawler', -'Python\-urllib/','Python-urllib', -'QCrawl','QCrawl', -'Quick-Crawler','Quick-Crawler', -'ResearchBot','ResearchBot', -'roboto','roboto', -'rogerbot/','rogerbot', -'RSSingBot','RSSingBot', -'RukiCrawler/','RukiCrawler', -'SafeDNS[\x20]search[\x20]bot/','SafeDNS search bot', -'SafeDNSBot','SafeDNSBot', -'SafeSearch[\x20]microdata[\x20]crawler','SafeSearch microdata crawler', -'safesearch','safesearch ( 
catchall )', -'SBL\-BOT','SBL-BOT', -'scrapy','scrapy', -'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/','Screaming Frog SEO Spider', -'ScreenerBot[\x20]Crawler[\x20]Beta','ScreenerBot Crawler Beta', -'Scrubby','Scrubby', -'Searchie/','Searchie', -'SecurityResearch\.bot','Security Research Bot', -'Seekmo','Seekmo', -'semanticbot','semanticbot', -'SemrushBot/','SemrushBot', -'SemrushBot-SI','SemrushBot-SI', -'seo\-audit\-check\-bot/','seo-audit-check-bot', -'Seobility','Seobility', -'SEOkicks\-Robot','SEOkicks-Robot', -'SEOlyticsCrawler/','SEOlyticsCrawler', -'SEOstats','SEOstats', -'Seosys/Nutch','Seosys/Nutch', -'Seoterritory\.com[\x20]bot','Seoterritory.com.bot', -'serendeputy','serendeputy', -'Shim\-Crawler','Shim-Crawler', -'SiteExplorer/','SiteExplorer', -'siteexplorer\.info','siteexplorer.info', -'siteimprove','siteimprove', -'Slackbot\-LinkExpanding','Slackbot-LinkExpanding', -'SmabblerBot/','SmabblerBot', -'Sogou[\x20]web[\x20]spider/','Sogou web spider', -'special_archiver/','special_archiver', -'Spiderbot/','Spiderbot', -'SpuhexBot','SpuhexBot', -'spyonweb','spyonweb', -'ssearch_bot','ssearch_bot', -'Streamline3Bot','Streamline3Bot', -'SurdotlyBot/','SurdotlyBot', -'SurveyBot/','SurveyBot', -'taiil/Nutch','taiil/Nutch', -'tbot\-nutch','tbot-nutch', -'TeeRaidBot','TeeRaidBot', -'TelegramBot','TelegramBot', -'Test/Nutch','Test/Nutch', -'Test[\x20]Spider','Test Spider', -'TestCrawler','TestCrawler', -'The[\x20]Knowledge[\x20]AI', 'The Knowledge AI', -'TkBot','TkBot', -'tracemyfile','tracemyfile', -'trendiction','trendiction', -'TurnitinBot/','TurnitinBot', -'TurnitinBot','TurnitinBot', -'TweetmemeBot/','TweetmemeBot', -'UCY/Nutch','UCY/Nutch', -'uni-leipzig\.de','uni-leipzig.de', -'Uptimebot/','Uptimebot', -'UptimeRobot/','UptimeRobot', -'URL[\x20]Checker','URL Checker', -'UXCrawlerBot','UXCrawlerBot', -'Validator\.nu/','Validator.nu', -'vBSEO','vBSEO', -'vBulletin[\x20]via[\x20]PHP','vBulletin via PHP', -'vebidoobot','vebidoobot', -'vegi[\x20]bot','vegi bot', 
-'Velen','Velen', -'viz/Nutch','viz/Nutch', -'VoilaBot','VoilaBot', -'VORTEX/','VORTEX', -'voyager/','voyager', -'vuhuvBot','vuhuvBot', -'W3C_Validator/','W3C_Validator', -'W3C\-checklink/','W3C-checklink', -'WBSearchBot/','WBSearchBot', -'WbSrch/','WbSrch/', -'WeSEE:Ads/PageBot','WeSEE:Ads/PageBot', -'WeSEE:Ads/PictureBot','WeSEE:Ads/PictureBot', -'WeSEE_Bot','WeSEE_Bot', -'Wget/','Wget', -'Who\.is[\x20]Bot','Who.is.Bot', -'wonderbot/','wonderbot', -'woobot/','woobot', -'Wotbox/','Wotbox', -'Xaldon[\x20]WebSpider','Xaldon WebSpider', -'Xenu[\x20]Link[\x20]Sleuth','Xenu Link Sleuth', -'xenu_link_sleuth','xenu_link_sleuth', -'XML[\x20]Sitemaps[\x20]Generator','XML Sitemaps Generator', -'XoviBot/','XoviBot', -'yacybot','yacybot', -'Yahoo[\x20]Link[\x20]Preview','Yahoo Link Preview', -'yak','yak-linkfluence', -'YisouSpider','YisouSpider', -'yoozBot','yoozBot', -'Your\-Website\-Sucks','Your-Website-Sucks', -'zoominfobot','zoominfobot', -'zspider/','zspider', -'ZumBot/','ZumBot', -'ng/1\.','ng/1.', -'ng/2\.','ng/2.', -'libwww\-perl','libwww-perl', -'urllib','urllib', -'javabee','javabee', -'projectwf\-java\-test\-crawler','projectwf-java-test-crawler', -'java','Java ( catchall )', -'loocalcrawler/nutch','loocalcrawler/nutch', -'nutchosu\-vlib','nutchosu-vlib', -'nutch','nutch ( catchall )', -'perlcrawler','perlcrawler', -'perl','perl', -'(firefox/)([0-9]\.|[0-1][0]\.)','Firefox version 10 and lower - various robots', - -# Less common robots (In robot file) -'^Mozilla$','Mozilla ( Rogue Robot )', -'^mozilla\/3\.0\s\(compatible$', 'mozilla/3.0 (compatible - ( Rogue Robot )', -'^mozilla\/4\.0$', 'mozilla/4.0 - ( Rogue Robot )', -'^mozilla\/4\.0\s\(compatible;\)$', 'mozilla/4.0 (compatible;) - ( Rogue Robot )', -'^mozilla\/5\.0$', 'mozilla/5.0 - ( Rogue Robot )', -'^mozilla\/5\.0\s\(compatible;$', 'mozilla/5.0 (compatible; - ( Rogue Robot )', -'^mozilla\/5\.0\s\(en\-us\)$', 'mozilla/5.0 (en-us) - ( Rogue Robot )', -'^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'mozilla/5.0 
firefox/3.0.5 - ( Rogue Robot )', -'^Mozilla/6\.0[\x20]\(compatible\)$','Mozilla/6.0 (compatible) - ( Rogue Robot )', -'^Mozilla/(.*)Beta[\x20]\(Windows\)','Mozilla Beta (Windows) - ( Rogue Robot )', -'MSIE[\x20]2','MSIE 2 - ( Rogue Robot )', -'MSIE[\x20]3','MSIE 3 - ( Rogue Robot )', -'MSIE[\x20]4','MSIE 4 - ( Rogue Robot )', -'MSIE[\x20]5','MSIE 5 - ( Rogue Robot )', -'MSIE[\x20]6','MSIE 6 - ( Rogue Robot )', -'MSIE\+6\.0\;','MSIE+6.0; - ( Rogue Robot)', -'Windows[\x20]95','Windows 95 - ( Rogue Robot )', -'Windows[\x20]98','Windows 99 - ( Rogue Robot )', - -# these could be removed to speed up processing as they are rarely seen -'a6\-indexer','a6-indexer', -'abcdatos','abcdatos', -'abonti\.com','abonti.com', -'acme\.spider','acme.spider', -'activebookmark','activebookmark', -'adamm_bot','adamm_bot', -'advbot','advbot', -'affectv\.co\.uk','affectv.co.uk', -'ahoythehomepagefinder','ahoythehomepagefinder', -'aleadsoftbot','aleadsoftbot', -'alkaline','alkaline', -'allrati','allrati', -'alltop','alltop', -'almaden','almaden', -'alpha_search_agent','alpha_search_agent', -'anthill','anthill', -'antibot','antibot', -'aport','aport', -'appie','appie', -'applesyndication','applesyndication', -'arachnophilia','arachnophilia', -'arale','arale', -'araneo','araneo', -'architext','architext', -'archive\-de\.com','archive-de.com', -'aretha','aretha', -'argus','argus', -'ariadne','ariadne', -'arianna\.libero\.it','arianna.libero.it', -'arks','arks', -'aspider','aspider', -'aspseek','aspseek', -'asterias','asterias', -'asynchttpclient','asynchttpclient', -'atn\.txt','atn.txt', -'atomz','atomz', -'auresys','auresys', -'awbot','awbot', -'backlinktest\.com','backlinktest.com', -'backrub','backrub', -'bbot','bbot', -'becomebot','becomebot', -'bender','bender', -'betabot','betabot', -'bigbrother','bigbrother', -'biglotron','biglotron', -'BingLocalSearch','BingLocalSearch', -'bittorrent_bot','bittorrent_bot', -'biz360[_+\s]spider','biz360 spider', -'bjaaland','bjaaland', 
-'blackwidow','blackwidow', -'blindekuh','blindekuh', -'blogbridge[_+\s]service','blogbridge service', -'blogged_crawl','blogged_crawl', -'bloglines','bloglines', -'bloglovin','bloglovin', -'blogpulse','blogpulse', -'blogsearch','blogsearch', -'blogshares','blogshares', -'blogslive','blogslive', -'blogssay','blogssay', -'bloodhound','bloodhound', -'bncf\.firenze\.sbn\.it/raccolta\.txt','bncf\.firenze\.sbn.it/raccolta.txt', -'bobby','bobby', -'bookmark\-manager','bookmark-manager', -'borg\-bot','borg-bot', -'boris','boris', -'brightnet','brightnet', -'bruinbot','bruinbot', -'bspider','bspider', -'bubing','bubing', -'bumblebee','bumblebee', -'butterfly','butterfly', -'buzztracker','buzztracker', -'cactvschemistryspider','cactvschemistryspider', -'calif[^r]','calif[^r]', -'candlelight[_+\s]favorites[_+\s]inspector','candlelight favorites inspector', -'careerbot','careerbot', -'carpathia','carpathia', -'cassandra','cassandra', -'catbot','catbot', -'cbn00glebot','cbn00glebot', -'cerberian_drtrs','cerberian_drtrs', -'cfetch','cfetch', -'cgireader','cgireader', -'chattertrap','chattertrap', -'check_http','check_http', -'checkbot','checkbot', -'checkweb_link_validator','checkweb_link_validator', -'christcrawler','christcrawler', -'churl','churl', -'cienciaficcion','cienciaficcion', -'cipinetbot','cipinetbot', -'imagecoccoc','imagecoccoc', -'coccoc','coccoc', -'coldfusion','coldfusion', -'collective','collective', -'combine','combine', -'commons\-httpclient','commons-httpclient', -'computer_and_automation_research_institute_crawler','computer_and_automation_research_institute_crawler', -'conceptbot','conceptbot', -'contentmatch','contentmatch', -'converamultimediacrawler','converamultimediacrawler', -'coolbot','coolbot', -'copubbot','copubbot', -'core','core', -'covario','covario', -'cruiser','cruiser', -'cscrawler','cscrawler', -'cuasarbot','cuasarbot', -'cursor','cursor', -'cusco','cusco', -'custo','custo', -'cyberspyder','cyberspyder', 
-'datafountains/dmoz_downloader','datafountains/dmoz_downloader', -'dataprovider\.com','dataprovider.com', -'daviesbot','daviesbot', -'daylifefeedfetcher','daylifefeedfetcher', -'daypopbot','daypopbot', -'deepindex','deepindex', -'desertrealm','desertrealm', -'deweb','deweb', -'dienstspider','dienstspider', -'digger','digger', -'digout4u','digout4u', -'diibot','diibot', -'dipsie\.bot','dipsie.bot', -'direct_hit','direct_hit', -'discobot','discobot', -'dlvr\.it','dlvr.it', -'dnabot','dnabot', -'dnsgroup','dnsgroup', -'doccheckbot','doccheckbot', -'domainappender','domainappender', -'domainchecker','domainchecker', -'domainsdb\.net','domainsdb.net', -'download_express','download_express', -'dragonbot','dragonbot', -'dreamwidth','dreamwidth', -'drupal','drupal', -'dulance','dulance', -'dumbot','dumbot', -'dumm\.de\-bot','dumm.de-bot', -'dwcp','dwcp', -'e\-collector','e-collector', -'earthcom\.info','earthcom.info', -'easydl','easydl', -'ebiness','ebiness', -'eccp','eccp', -'echo!','echo!', -'edgeio\-retriever','edgeio-retriever', -'elfinbot','elfinbot', -'emacs','emacs', -'emcspider','emcspider', -'enteprise','enteprise', -'ernst[:blank:]2\.0','ernst[:blank:]2.0', -'esther','esther', -'ets_v','ets_v', -'eventax','eventax', -'everbeecrawler','everbeecrawler', -'everest\-vulcan','everest-vulcan', -'evliyacelebi','evliyacelebi', -'exactseek','exactseek', -'extreme[_+\s]picture[_+\s]finder','extreme picture finder', -'ezoom','ezoom', -'ezresult','ezresult', -'facebook','facebook', -'facebot','facebot', -'fast\-search\-engine','fast-search-engine', -'matrix_s\.p\.a\._\-_fast_enterprise_crawler','matrix_s.p.a._-_fast_enterprise_crawler', -'fast_enterprise_crawler','fast_enterprise_crawler', -'fastbot','fastbot', -'fastcrawler','fastcrawler', -'favicon','favicon', -'favorg','favorg', -'favorites_sweeper','favorites_sweeper', -'fdse','fdse', -'feedburner','feedburner', -'feedcrawl','feedcrawl', -'feedflow','feedflow', -'feedmyinbox','feedmyinbox', 
-'feedroll\.com','feedroll.com', -'feedsky','feedsky', -'feedster','feedster', -'feedvalidator','feedvalidator', -'feedzira','feedzira', -'felix','felix', -'ferret','ferret', -'fetchbot','fetchbot', -'fetchrover','fetchrover', -'fever/','fever', -'fido','fido', -'filmkamerabot','filmkamerabot', -'filterdb\.iss\.net','filterdb.iss.net', -'finderlein[_+\s]research[_+\s]crawler','finderlein research crawler', -'findexa_crawler','findexa_crawler', -'finnish','finnish', -'fireball','fireball', -'firmilybot','firmilybot', -'flexum','flexum', -'foaf\-search\.net','foaf-search.net', -'fooky\.com/ScorpionBot','fooky.com/ScorpionBot', -'fouineur','fouineur', -'francoroute','francoroute', -'freecrawl','freecrawl', -'freenews','freenews', -'funnelweb','funnelweb', -'g2crawler','g2crawler', -'gama','gama', -'gazz','gazz', -'gcreep','gcreep', -'geniebot','geniebot', -'genieo','genieo', -'geohasher','geohasher', -'getbot','getbot', -'geturl','geturl', -'gigablastopensource','gigablastopensource', -'global_fetch','global_fetch', -'gnodspider','gnodspider', -'goforit\.com','goforit.com', -'goforitbot','goforitbot', -'golem','golem', -'gonzo','gonzo', -'gougou','gougou', -'gpu_p2p_crawler','gpu_p2p_crawler', -'grabber','grabber', -'grapeshot','grapeshot', -'grapnel','grapnel', -'griffon','griffon', -'gromit','gromit', -'grub','grub', -'gulliver','gulliver', -'gulperbot','gulperbot', -'hambot','hambot', -'hanrss','hanrss', -'harvest','harvest', -'havindex','havindex', -'henrythemiragorobot','henrythemiragorobot', -'heritrix','heritrix', -'hl_ftien_spider','hl_ftien_spider', -'hometown','hometown', -'hoowwwer','hoowwwer', -'hpprint','hpprint', -'htdig','htdig', -'html[_+\s]link[_+\s]validator','html link validator', -'htmlgobble','htmlgobble', -'htmlparser','htmlparser', -'httrack','httrack', -'hundesuche\.com\-bot','hundesuche.com-bot', -'hyperdecontextualizer','hyperdecontextualizer', -'ia_archiver\-web\.archive\.org','ia_archiver-web.archive.org', -'ia_archiver','ia_archiver', 
-'iajabot','iajabot', -'iaskspider','iaskspider', -'i\-bot','i-bot', -'icarus6j','icarus6j', -'ichiro','ichiro', -'icjobs\.de','icjobs.de', -'ilse','ilse', -'iltrovatore\-setaccio','iltrovatore-setaccio', -'imagelock','imagelock', -'implisensebot','implisensebot', -'inagist','inagist', -'incywincy','incywincy', -'infobot','infobot', -'infociousbot','infociousbot', -'infohelfer','infohelfer', -'infomine','infomine', -'informant','informant', -'infoseeksidewinder','infoseeksidewinder', -'infoseek','infoseek', -'infospider','infospider', -'inspectorwww','inspectorwww', -'insurancobot','insurancobot', -'integromedb\.org','integromedb.org', -'intelliagent','intelliagent', -'internet[_+\s]ninja','internet ninja', -'internetarchive','internetarchive', -'internetseer','internetseer', -'internetsupervision','internetsupervision', -'ips\-agent','ips-agent', -'irobot','irobot', -'iron33','iron33', -'isearch2006','isearch2006', -'israelisearch','israelisearch', -'iupui_research_bot','iupui_research_bot', -'izsearch','izsearch', -'jacobin[\x20]club','jacobin club', -'jakarta','jakarta', -'jbot','jbot', -'jcrawler','jcrawler', -'jeeves','jeeves', -'jennybot','jennybot', -'jobboerse','jobboerse', -'jobot','jobot', -'jobo','jobo', -'joebot','joebot', -'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','jrtwine software check favorites utility', -'js\-kit','js-kit', -'jubii','jubii', -'jumpstation','jumpstation', -'justview','justview', -'kalambot','kalambot', -'kamano\.de_newsfeedverzeichnis','kamano.de_newsfeedverzeichnis', -'kapsi','kapsi', -'katipo','katipo', -'kazoombot','kazoombot', -'kevin','kevin', -'keyoshid','keyoshid', -'kilroy','kilroy', -'kinja\-imagebot','kinja-imagebot', -'kinjabot','kinjabot', -'knowitall','knowitall', -'knowledge\.com','knowledge.com', -'ko[_+\s]yappo[_+\s]robot','ko yappo robot', -'kouaa_krawler','kouaa_krawler', -'krugle','krugle', -'ksibot','ksibot', -'kummhttp','kummhttp', -'kurzor','kurzor', -'labelgrabber\.txt','labelgrabber.txt', 
-'lanshanbot','lanshanbot', -'larbin','larbin', -'largesmall[\x20]crawler','largesmall crawler', -'legs','legs', -'letscrawl\.com','letscrawl.com', -'libcrawl','libcrawl', -'lilina','lilina', -'link_valet_online','link_valet_online', -'linkbot','linkbot', -'linkdex\.com','linkdex.com', -'linkidator','linkidator', -'linkscan','linkscan', -'linkstats[\x20]bot','linkstats bot', -'linkwalker','linkwalker', -'lipperhey','lipperhey', -'livejournal\.com','livejournal.com', -'lmspider','lmspider', -'loadtimebot','loadtimebot', -'lockon','lockon', -'logo_gif','logo_gif', -'longurl','longurl', -'lssrocketcrawler','lssrocketcrawler', -'ltbot','ltbot', -'ltx71','ltx71', -'lwp\-request','lwp-request', -'lwp\-trivial','lwp-trivial', -'lycos[_+\s]','lycos ', -'macworm','macworm', -'madaali\.de','madaali.de', -'magpierss','magpierss', -'magpie','magpie', -'mapoftheinternet\.com','mapoftheinternet.com', -'marvin','marvin', -'mattie','mattie', -'mediabot','mediabot', -'mediafox','mediafox', -'megaindex','megaindex', -'megite','megite', -'memorybot','memorybot', -'mercator','mercator', -'meshexplorer','meshexplorer', -'metager2\-verification\-bot','metager2-verification-bot', -'metajobbot','metajobbot', -'metaspinner','metaspinner', -'metauri','metauri', -'miadev','miadev', -'microsoft[_+\s]url[_+\s]control','microsoft url control', -'microsoft[\x20]bits','microsoft bits', -'microsoft\-webdav\-miniredir','microsoft-webdav-miniredir', -'mindcrawler','mindcrawler', -'mindupbot','mindupbot', -'mini\-reptile','mini-reptile', -'minirank','minirank', -'misterbot','misterbot', -'miva','miva', -'mizzu_labs','mizzu_labs', -'mnogosearch','mnogosearch', -'moget','moget', -'momspider','momspider', -'monster','monster', -'motor','motor', -'movabletype','movabletype', -'ms[_+\s]search[_+\s]6\.0[_+\s]robot','ms search 6.0 robot', -'ms_search_4\.0_robot','ms_search_4.0_robot', -'msnbot\-udiscovery','msnbot-udiscovery', -'msrabot','msrabot', -'msrbot','msrbot', 
-'mt::telegraph::agent','mt::telegraph::agent', -'muncher','muncher', -'muscatferret','muscatferret', -'mwdsearch','mwdsearch', -'mydoyouhike','mydoyouhike', -'myweb','myweb', -'nagios','nagios', -'nasa_search','nasa_search', -'ndspider','ndspider', -'nederland\.zoek','nederland.zoek', -'netcarta','netcarta', -'netcraft','netcraft', -'netluchs','netluchs', -'netmechanic','netmechanic', -'netnewswire','netnewswire', -'netscoop','netscoop', -'netsprint','netsprint', -'netvibes','netvibes', -'newrelicpinger','newrelicpinger', -'newscan\-online','newscan-online', -'newsfox','newsfox', -'newsgatoronline','newsgatoronline', -'nextgensearchbot','nextgensearchbot', -'nhse','nhse', -'nicebot','nicebot', -'nimblecrawler','nimblecrawler', -'ning','ning', -'nomad','nomad', -'northstar','northstar', -'noxtrumbot','noxtrumbot', -'npbot','npbot', -'nzexplorer','nzexplorer', -'objectssearch','objectssearch', -'occam','occam', -'ocelli','ocelli', -'octopus','octopus', -'octora_beta_bot','octora_beta_bot', -'onet\.pl[_+\s]sa','onet.pl sa', -'onfolio','onfolio', -'openfind','openfind', -'opentaggerbot','opentaggerbot', -'openwebspider','openwebspider', -'optimizer','optimizer', -'oracle_ultra_search','oracle_ultra_search', -'orb_search','orb_search', -'orbiter','orbiter', -'packrat','packrat', -'pageboy','pageboy', -'panscient','panscient', -'parasite','parasite', -'passwordmaker\.org','passwordmaker.org', -'patric','patric', -'pear_http_request_class','pear_http_request_class', -'peerbot','peerbot', -'pegasus','pegasus', -'perignator','perignator', -'perman','perman', -'petersnews','petersnews', -'phantom','phantom', -'php[_+\s]version[_+\s]tracker','php version tracker', -'phpcrawl','phpcrawl', -'phpdig','phpdig', -'picmole','picmole', -'pictureofinternet','pictureofinternet', -'piltdownman','piltdownman', -'pimptrain','pimptrain', -'ping\.blo\.gs','ping.blo.gs', -'pingdom','pingdom', -'pioneer','pioneer', -'pita','pita', -'pitkow','pitkow', -'pjspider','pjspider', 
-'plinki','plinki', -'pluckfeedcrawler','pluckfeedcrawler', -'plumtreewebaccessor','plumtreewebaccessor', -'pogodak','pogodak', -'pompos','pompos', -'popdexter','popdexter', -'poppi','poppi', -'port_huron_labs','port_huron_labs', -'portalb','portalb', -'postfavorites','postfavorites', -'postpost','postpost', -'postrank','postrank', -'powermarks','powermarks', -'printfulbot','printfulbot', -'proodlebot','proodlebot', -'protopage','protopage', -'publiclibraryarchive','publiclibraryarchive', -'pyquery','pyquery', -'python','python', -'qihoobot','qihoobot', -'quipply','quipply', -'qwantify','qwantify', -'r6\_','r6\_', -'rambler','rambler', -'ratingburner','ratingburner', -'raven','raven', -'rbse','rbse', -'redalert','redalert', -'regator','regator', -'relevantnoise\.com','relevantnoise.com', -'resumerobot','resumerobot', -'rhcs','rhcs', -'riddler','riddler', -'road_runner','road_runner', -'robbie','robbie', -'robi','robi', -'robocrawl','robocrawl', -'robofox','robofox', -'robozilla','robozilla', -'rojo','rojo', -'rome[\x20]client','rome client', -'roverbot','roverbot', -'rpt\-httpclient','rpt-httpclient', -'rssgraffiti','rssgraffiti', -'rssimagesbot','rssimagesbot', -'ruffle','ruffle', -'rufusbot','rufusbot', -'rules','rules', -'safeads\.xyz','safeads.xyz', -'safetynetrobot','safetynetrobot', -'sage\+\+','sage++', -'sandcrawler','sandcrawler', -'savetheworldheritage','savetheworldheritage', -'sbider','sbider', -'schizozilla','schizozilla', -'scooter','scooter', -'scoutjet','scoutjet', -'scumbot','scumbot', -'search\-info','search-info', -'search_au','search_au', -'searchguild[_+\s]dmoz[_+\s]experiment','searchguild dmoz experiment', -'searchmetricsbot','searchmetricsbot', -'searchprocess','searchprocess', -'seekbot','seekbot', -'semalt','semalt', -'senrigan','senrigan', -'sensis_web_crawler','sensis_web_crawler', -'seodiver','seodiver', -'seokicks\.de','seokicks.de', -'seoscanners','seoscanners', -'sgscout','sgscout', -'shaggy','shaggy', -'shaihulud','shaihulud', 
-'shareaholicbot','shareaholicbot', -'shoutcast','shoutcast', -'sift','sift', -'simbot','simbot', -'simplepie','simplepie', -'sistrix','sistrix', -'site\-valet','site-valet', -'sitebot','sitebot', -'sitedomain\-bot','sitedomain-bot', -'sitetech','sitetech', -'skimbot','skimbot', -'skymob','skymob', -'slcrawler','slcrawler', -'slurp','slurp', -'slysearch','slysearch', -'smartspider','smartspider', -'smtbot','smtbot', -'snap\.com_beta_crawler','snap.com_beta_crawler', -'snappy','snappy', -'snooper','snooper', -'sohu\-search','sohu-search', -'sohu','sohu ( catchall )', -'solbot','solbot', -'speedy','speedy', -'sphere_scout','sphere_scout', -'spider[_+\s]monkey','spider monkey', -'spiderline','spiderline', -'spiderlytics','spiderlytics', -'spiderman','spiderman', -'spiderview','spiderview', -'spip','spip', -'sproose_crawler','sproose_crawler', -'spry','spry', -'sqworm','sqworm', -'ssearcher','ssearcher', -'steeler','steeler', -'steroid__download','steroid__download', -'stq_bot','stq_bot', -'Stratagems[\x20]Kumo','Stratagems Kumo', -'suchfin\-bot','suchfin-bot', -'suke','suke', -'summify\.com','summify.com', -'sunrise','sunrise', -'suntek','suntek', -'superbot','superbot', -'superfeedr','superfeedr', -'susie','susie', -'sven','sven', -'syndic8','syndic8', -'syndicapi','syndicapi', -'synoobot','synoobot', -'synthesio','synthesio', -'t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','t-h-u-n-d-e-r-s-t-o-n-e', -'tach_bw','tach_bw', -'tagyu_agent','tagyu_agent', -'tailrank','tailrank', -'tarantula','tarantula', -'tarspider','tarspider', -'tcl_http_client_package','tcl_http_client_package', -'techbot','techbot', -'technoratibot','technoratibot', -'templeton','templeton', -'teoma','teoma', -'teragramcrawlersurf','teragramcrawlersurf', -'test_crawler','test_crawler', -'testbot','testbot', -'thumbsniper','thumbsniper', -'titan','titan', -'titin','titin', -'tkwww','tkwww', -'tlspider','tlspider', -'topblogsinfo','topblogsinfo', -'topicblogs','topicblogs', -'topix\.net','topix.net', 
-'trapit','trapit', -'trileet','trileet', -'turtlescanner','turtlescanner', -'turtle','turtle', -'tutorgigbot','tutorgigbot', -'tweetedtimes','tweetedtimes', -'twiceler','twiceler', -'twisted[\x20]pagegetter','twisted pagegetter', -'twitterbot','twitterbot', -'twitterfeed','twitterfeed', -'ubicrawler','ubicrawler', -'ucsd','ucsd', -'udmsearch','udmsearch', -'ultraseek','ultraseek', -'um\-IC','ubermetrics-technologies.com', -'um\-LN','ubermetrics-technologies.com', -'unchaos_bot_hybrid_web_search_engine','unchaos_bot_hybrid_web_search_engine', -'unido\-bot','unido-bot', -'unisterbot','unisterbot', -'universalfeedparser','universalfeedparser', -'unlost_web_crawler','unlost_web_crawler', -'unwindfetchor','unwindfetchor', -'updated','updated', -'urlck','urlck', -'ustc\-semantic\-group','ustc-semantic-group', -'vagabondo\-wap','vagabondo-wap', -'vagabondo','vagabondo', -'valkyrie','valkyrie', -'vermut','vermut', -'versus_crawler_from_eda\.baykan@epfl\.ch','versus_crawler_from_eda.baykan@epfl.ch', -'verticrawl','verticrawl', -'vespa_crawler','vespa_crawler', -'victoria','victoria', -'virus[_+\s]detector','virus_detector', -'visionsearch','visionsearch', -'voidbot','voidbot', -'voltron','voltron', -'vse/','vse', -'vwbot','vwbot', -'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa','w3c_css_validator_jfouffa', -'w3index','w3index', -'w3m2','w3m2', -'wallpaper','wallpaper', -'wanderer','wanderer', -'wapspider','wapspider', -'wapspIRLider','wapspIRLider', -'watchmouse','watchmouse', -'wavefire','wavefire', -'waybackarchive\.org','waybackarchive.org', -'wazzup','wazzup', -'web_downloader','web_downloader', -'webbandit','webbandit', -'webbase','webbase', -'webcatcher','webcatcher', -'webclipping\.com','webclipping.com', -'webcollage','webcollage', -'webcompass','webcompass', -'webcopy','webcopy', -'webcrawl\.net','webcrawl.net', -'webdup','webdup', -'webfetcher','webfetcher', -'webfilter','webfilter', -'webfoot','webfoot', -'webinator','webinator', -'webindexer','webindexer', 
-'weblayers','weblayers', -'weblinker','weblinker', -'webminer','webminer', -'webmirror','webmirror', -'webmoose','webmoose', -'webquest','webquest', -'webreader','webreader', -'webreaper','webreaper', -'website[_+\s]monitoring[_+\s]bot','website monitoring bot', -'websnarf','websnarf', -'webspider','webspider', -'webvac','webvac', -'webvulncrawl','webvulncrawl', -'webwalker','webwalker', -'webwalk','webwalk', -'webwatch','webwatch', -'wells_search','wells_search', -'wer\-liefert\-was','wer-liefert-was', -'wesee:search','wesee:search', -'wevikabot','wevikabot', -'whatuseek','whatuseek', -'whowhere','whowhere', -'windows\-rss\-platform','windows-rss-platform', -'wired\-digital','wired-digital', -'zyborg','zyborg', -'wisenutbot','wisenutbot', -'wiumi','wiumi', -'wmir','wmir', -'wolp','wolp', -'wombat','wombat', -'wonderer','wonderer', -'woozweb','woozweb', -'wordpress','wordpress', -'worm','worm', -'wume_crawler','wume_crawler', -'wwwc','wwwc', -'wwweasel','wwweasel', -'wz101','wz101', -'xget','xget', -'xirq','xirq', -'xydo','xydo', -'y!j','y!j', -'yahoo![\x20]searchmonkey','yahoo! 
searchmonkey', -'yahoo!_mindset','yahoo!_mindset', -'yahoo\-blogs','yahoo-blogs', -'yahoo\-mmcrawler','yahoo-mmcrawler', -'yahoo\-newscrawler','yahoo-newscrawler', -'yahoo[\x20]pipes','yahoo pipes', -'yahoo\-verticalcrawler','yahoo-verticalcrawler', -'yahoocachesystem','yahoocachesystem', -'yahooexternalcache','yahooexternalcache', -'yahoofeedseeker','yahoofeedseeker', -'yahooseeker\-testing','yahooseeker-testing', -'yahooseeker','yahooseeker', -'yahooysmcm','yahooysmcm', -'yammer','yammer', -'yanga','yanga', -'yet\-another\-spider','yet-another-spider', -'yeti','yeti', -'yie8','yie8', -'yodaobot','yodaobot', -'yooglifetchagent','yooglifetchagent', -'youdao','youdao', -'yourls','yourls', -'z\-add_link_checker','z-add_link_checker', -'zealbot','zealbot', -'zemanta','zemanta', -'zend_http_client','zend_http_client', -'zeus','zeus', -'zhuaxia','zhuaxia', -'[^a]fish','[^a]fish', -'[\x20]netseer[\x20]',' netseer ', -'^[1-3]$','^[1-3]$', -'^finbot','^finbot', -'^motorola$','^motorola$', -'^msie','^msie', -'^voyager/','^voyager', -'^webindex$','webindex', -'1\-more_scanner','1-more_scanner', -# below placed at end to catch some generics -'nbot','nbot', - -# Generic robot -'robot','robot', -'blog','blog', -'checker','checker', -'crawl','crawl', -'discover','discover', -'feed','feed', -'fetcher','fetcher', -'hunter','hunter', -'link','link', -'scanner','scanner', -'seek','seek', -'sitemap','sitemap', -'spider','spider', -'sucker','sucker', -'survey','survey', -'validator','validator', -'bot[\s_+:,\.\;\/\\\-]','Unknown robot identified by bot\*', -'[\s_+:,\.\;\/\\\-]bot','Unknown robot identified by \*bot', -'curl','Curl', -'php','A PHP script', -'ruby/','Ruby script', -'no_user_agent','empty user agent string', -# Moving oBot towards the end so it does not pick up other *obot robots -'oBot/','oBot', -# Unknown robots identified by hit on robots.txt -'unknown','Unknown robot (identified by hit on robots.txt)' -); - - -# RobotsAffiliateLib -# This list try to tell by which 
Search Engine a robot is used -#------------------------------------------------------------- -%RobotsAffiliateLib = ( -); - -1; diff --git a/test/awstats/conf/awstats.testnginx.conf b/test/awstats/conf/awstats.testnginx.conf new file mode 100644 index 00000000..ffb43cd6 --- /dev/null +++ b/test/awstats/conf/awstats.testnginx.conf @@ -0,0 +1,806 @@ +# AWStats configure file +#------------------------------------------------------------------------ +# Copy this file into awstats.www.myserver.mydomain.conf or awstats.conf +# and edit this new file to setup AWStats. +# If you don't understand what is a parameter, keep default value. +#------------------------------------------------------------------------ + + +# Main setup section (Required to /test AWStats working) +#------------------------------------------------------------------------ + +# "LogFile" contains the web server log file to analyze. +# Possible values: A full path, or a relative path from awstats.pl directory. +# Example: "/var/log/apache/access.log" +# Example: "../logs/mycombinedlog.log" +# You can also use tags in this filename if you need a dynamic file name +# depending on date or time (Replacement is made by AWStats at the beginning +# of its execution). 
This is available tags : +# %YYYY-n is replaced with 4 digits year we were n hours ago +# %YY-n is replaced with 2 digits year we were n hours ago +# %MM-n is replaced with 2 digits month we were n hours ago +# %MO-n is replaced with 3 letters month we were n hours ago +# %DD-n is replaced with day we were n hours ago +# %HH-n is replaced with hour we were n hours ago +# %NS-n is replaced with number of seconds at 00:00 since 1970 +# %WM-n is replaced with the week number in month (1-5) +# %Wm-n is replaced with the week number in month (0-4) +# %WY-n is replaced with the week number in year (01-52) +# %Wy-n is replaced with the week number in year (00-51) +# %DW-n is replaced with the day number in week (1-7, 1=sunday) +# use n=24 if you need (1-7, 1=monday) +# %Dw-n is replaced with the day number in week (0-6, 0=sunday) +# use n=24 if you need (0-6, 0=monday) +# Use 0 for n if you need current year, month, day, hour... +# Example: "/var/log/access_log.%YYYY-0%MM-0%DD-0.log" +# Example: "C:/WINNT/system32/LogFiles/W3SVC1/ex%YY-24%MM-24%DD-24.log" +# You can also use a pipe if log file come from a pipe. +# Example: "gzip -d ) : +# +# If code is not added in index page, all this detection capabilities will be +# disabled. You must also check that ShowScreenSizeStats and ShowMiscStats +# parameters are set to 1 to make results appear in report page. +# If you change this parameter, you must also change the +# awstatsmisctrackerurl variable into the awstats_misc_tracker.js file. +# Change : Effective for new updates only. +# Possible value: Name of javascript tracker file added in HTML code +# Default: "/js/awstats_misc_tracker.js" +# +MiscTrackerUrl="/js/awstats_misc_tracker.js" + + +# Add here a list of kind of url (file extension) that must be counted as +# "Hit only" and not as a "Hit" and "Page/Download". You can set here all +# images extensions as they are hit downloaded that must be counted but they +# are not viewed pages. 
URLs with such extensions are not included in the TOP +# Pages/URL report. +# Note: If you want to exclude particular URLs from stats (No Pages and no +# Hits reported), you must use SkipFiles parameter. +# Change : Effective for new updates only +# Example: "css js class gif jpg jpeg png bmp ico zip arj gz z wav mp3 wma mpg" +# Example: "" +# Default: "css js class gif jpg jpeg png bmp ico" +# +NotPageList="css js class gif jpg jpeg png bmp ico" + + +# Default index page name for your web server. +# Change : Effective for new updates only +# Example: "index.php index.html default.html" +# Default: "index.html" +# +DefaultFile="index.php index.html" + + + + +#------------------------------------------------------------------------ +# Optionnal setup section (Not required but increase AWStats features) +#------------------------------------------------------------------------ + +# Set your primary language. +# Possible value: +# Albanian=al, Bosnian=ba, Bulgarian=bg, Catalan=ca, +# Chinese (Taiwan)=tw, Chinese (Simpliefied)=cn, Czech=cz, Danish=dk, +# Dutch=nl, English=en, Estonian=et, Euskara=eu, Finnish=fi, +# French=fr, Galician=gl, German=de, Greek=gr, Hebrew=he, Hungarian=hu, +# Icelandic=is, Indonesian=id, Italian=it, Japanese=jp, Korean=kr, +# Latvian=lv, Norwegian (Nynorsk)=nn, Norwegian (Bokmal)=nb, Polish=pl, +# Portuguese=pt, Portuguese (Brazilian)=br, Romanian=ro, Russian=ru, +# Serbian=sr, Slovak=sk, Spanish=es, Swedish=se, Turkish=tr, Ukrainian=ua, +# Welsh=wlk. +# First available language accepted by browser=auto +# Default: "auto" +# +Lang="auto" + + +# Do not include access from clients that match following criteria. +# If your log file contains IP adresses in host field, you must enter here +# matching IP adresses criteria. +# If DNS lookup is already done in your log file, you must enter here hostname +# criteria, else enter ip address criteria. +# The opposite parameter of "SkipHosts" is "OnlyHosts". +# Note: Use space between each value. 
This parameter is not case sensitive. +# Note: You can use regular expression values writing value with REGEX[value]. +# Change : Effective for new updates only +# Example: "127.0.0.1 REGEX[^192\.168\.] REGEX[^10\.0\.0\.]" +# Example: "localhost REGEX[^.*\.localdomain$]" +# Default: "" +# +SkipHosts="" + + +# Do not include access from clients with a user agent that match following +# criteria. If you want to exclude a robot, you should update the robots.pm +# file instead of this parameter. +# Note: Use space between each value. This parameter is not case sensitive. +# Note: You can use regular expression values writing value with REGEX[value]. +# Change : Effective for new updates only +# Example: "konqueror REGEX[ua_test_v\d\.\d]" +# Default: "" +# +SkipUserAgents="" + + +# Use SkipFiles to ignore access to URLs that match one of following entries. +# You can enter a list of not important URLs (like framed menus, hidden pages, +# etc...) to exclude them from statistics. You must enter here exact relative +# URL as found in log file, or a matching REGEX value. +# For example, to ignore /badpage.html, just add "/badpage.html". To ignore +# all pages in a particular directory, add "REGEX[^\/directorytoexclude]". +# The opposite parameter of "SkipFiles" is "OnlyFiles". +# Note: Use space between each value. This parameter is not case sensitive. +# Note: You can use regular expression values writing value with REGEX[value]. +# Change : Effective for new updates only +# Example: "/badpage.html REGEX[^\/excludedirectory]" +# Default: "" +# +SkipFiles="" + + +# Some web servers on some Operating systems (IIS-Windows) considers that a +# login with same value but different case are the same login. To tell AWStats +# to also considers them as one, set this parameter to 1. +# Possible values: 0 or 1 +# Default: 0 +# +AuthenticatedUsersNotCaseSensitive=1 + + +# Keep or remove the anchor string you can find in some URLs. 
+# Possible values: 0 or 1 +# Default: 0 +# +URLWithAnchor=0 + + +# In URL links, "?" char is used to add parameter's list in URLs. Syntax is: +# /mypage.html?param1=value1&param2=value2 +# However, some servers/sites use also others chars to isolate dynamic part of +# their URLs. You can complete this list with all such characters. +# Change : Effective for new updates only +# Example: "?;," +# Default: "?;" +# +URLQuerySeparators="?;" + + +# Keep or remove the query string to the URL in the statistics for individual +# pages. This is primarily used to differentiate between the URLs of dynamic +# pages. If set to 1, mypage.html?id=x and mypage.html?id=y are counted as two +# different pages. +# Warning, when set to 1, memory required to run AWStats is dramatically +# increased if you have a lot of changing URLs (for example URLs with a random +# id inside). Such web sites should not set this option to 1 or use seriously +# the next parameter URLWithQueryWithOnlyFollowingParameters (or eventually +# URLWithQueryWithoutFollowingParameters). +# Change : Effective for new updates only +# Possible values: +# 0 - URLs are cleaned from the query string (ie: "/mypage.html") +# 1 - Full URL with query string is used (ie: "/mypage.html?p=x&q=y") +# Default: 0 +# +URLWithQuery=1 + + +# When URLWithQuery is on, you will get the full URL with all parameters in +# URL reports. But among those parameters, sometimes you don't need a +# particular parameter because it does not identify the page or because it's +# a random ID changing for each access even if URL points to same page. In +# such cases, it is highly recommended to ask AWStats to keep only parameters +# you need (if you know them) before counting, manipulating and storing it. +# Enter here list of wanted parameters.
For example, with "param", one hit on +# /mypage.cgi?param=abc&id=Yo4UomP9d and /mypage.cgi?param=abc&id=Mu8fdxl3r +# will be reported as 2 hits on /mypage.cgi?param=abc +# This parameter is not used when URLWithQuery is 0 and can't be used with +# URLWithQueryWithoutFollowingParameters. +# Change : Effective for new updates only +# Example: "param" +# Default: "" +# +URLWithQueryWithOnlyFollowingParameters="" + + +# When URLWithQuery is on, you will get the full URL with all parameters in +# URL reports. But among those parameters, sometimes you don't need a +# particular parameter because it does not identify the page or because it's +# a random ID changing for each access even if URL points to same page. In +# such cases, it is highly recommended to ask AWStats to remove such parameters +# from the URL before counting, manipulating and storing it. Enter here list +# of all non wanted parameters. For example if you enter "id", one hit on +# /mypage.cgi?p=abc&id=Yo4UomP9d and /mypage.cgi?p=abc&id=Mu8fdxl3r +# will be reported as 2 hits on /mypage.cgi?p=abc +# This parameter is not used when URLWithQuery is 0 and can't be used with +# URLWithQueryWithOnlyFollowingParameters. +# Change : Effective for new updates only +# Example: "PHPSESSID jsessionid" +# Default: "" +# +URLWithQueryWithoutFollowingParameters="productId jsessionid" + + +# Keep or remove the query string to the referrer URL in the statistics for +# external referrer pages. This is used to differentiate between the URLs of +# dynamic referrer pages. If set to 1, mypage.html?id=x and mypage.html?id=y +# are counted as two different referrer pages. +# Change : Effective for new updates only +# Possible values: +# 0 - Referrer URLs are cleaned from the query string (ie: "/mypage.html") +# 1 - Full URL with query string is used (ie: "/mypage.html?p=x&q=y") +# Default: 0 +# +URLReferrerWithQuery=0 + + +# AWStats can detect setup problems or show you important information to have +# a better use.
Keep this to 1, except if AWStats says you can change it. +# Possible values: 1 or 0 +# Default: 1 +WarningMessages=1 + + + +#----------------------------------------------------------------------------- +# OPTIONAL ACCURACY SETUP SECTION (Not required but increase AWStats features) +#----------------------------------------------------------------------------- + +# Following values allows you to define accuracy of AWStats entities (robots, +# browsers, os, referers, file types) detection. +# It is recommanded that very important web sites or ISP that provides AWStats +# to their customer set this parameter to 1 (or 0), instead of 2. +# Possible values: +# 0 = No detection, +# 1 = Medium/Standard detection +# 2 = Full detection +# Change : Effective for new updates only +# Default: 2 (0 for LevelForWormsDetection) +# +LevelForBrowsersDetection=2 # 0 disables Browsers detection. +LevelForOSDetection=2 # 0 disables OS detection. +LevelForRefererAnalyze=2 # 0 disables Origin detection. +LevelForRobotsDetection=2 # 0 disables Robots detection. +LevelForSearchEnginesDetection=2 # 0 disables Search engines detection. +LevelForKeywordsDetection=2 # 0 disables Keyphrases/Keywords detection. +LevelForFileTypesDetection=1 # 0 disables File types detection. +LevelForWormsDetection=2 # 0 disables Worms detection. + + + +#----------------------------------------------------------------------------- +# OPTIONAL APPEARANCE SETUP SECTION (Not required but increase AWStats features) +#----------------------------------------------------------------------------- + +# When you use AWStats as a CGI, you can have the reports shown in HTML frames. +# Frames are only available for report viewed dynamically. When you build +# pages from command line, this option is not used and no frames are built. +# Possible values: 0 or 1 +# Default: 1 +# +UseFramesWhenCGI=1 + + +# Each URL shown in stats page are links you can click. 
+# Possible values: 1 or 0 +# Default: 1 +# +ShowLinksOnUrl=1 + + +# List of visible flags that link to other language translations. +# See Lang parameter for list of allowed flag/language codes. +# If you don't want any flag link, set ShowFlagLinks to "". +# This parameter is used only if ShowMenu parameter is set to 1. +# Possible values: "" or "language_codes_separated_by_space" +# Example: "en es fr nl es" +# Default: "" +# +ShowFlagLinks="fr" + + +# Search engines keywords reported are full search string or separate keywords +# Possible values: +# 0 - Search keywords reported are full search string (ie: "town maps") +# 1 - Search keywords reported are separated words (ie: "town" and "maps") +# Default: 0 +# +SplitSearchString=0 + + +# You can put here HTML code that will be added at the end of AWStats reports. +# Great to add advert ban. +# Default: "" +# +HTMLEndSection="" + + +# Value of maximum bar width/height for horizontal/vertical graphics bar +# Default: 260/220 +# +BarWidth = 260 +BarHeight = 220 + + +# This value can be used to choose maximum number of lines shown for each +# particular reporting.
+# +# Stats by domains +MaxNbOfDomain = 25 +# Stats by hosts +MaxNbOfHostsShown = 25 +MinHitHost = 1 +# Stats by authenticated users +MaxNbOfLoginShown = 5 +MinHitLogin = 1 +# Stats by robots +MaxNbOfRobotShown = 25 +MinHitRobot = 1 +# Stats by pages +MaxNbOfPageShown = 25 +MinHitFile = 1 +# Stats by referers +MaxNbOfRefererShown = 25 +MinHitRefer = 1 +# Stats for keywords +MaxNbOfKeywordsShown = 25 +MinHitKeyword = 1 + + +ShowHeader=1 # Show AWStats head title and icon +ShowMenu=1 # Show menu header with links on detailed reports +ShowMonthDayStats=1 +ShowDaysOfWeekStats=1 +ShowHoursStats=1 +ShowDomainsStats=1 +ShowHostsStats=1 +ShowAuthenticatedUsers=1 +ShowRobotsStats=1 +ShowPagesStats=1 +ShowCompressionStats=0 # Show report of compression stats when using mod_gzip +ShowFileTypesStats=1 +ShowFileSizesStats=0 # Not yet available +ShowBrowsersStats=1 +ShowOSStats=1 +ShowOriginStats=1 +ShowKeyphrasesStats=1 +ShowKeywordsStats=1 +ShowHTTPErrorsStats=1 +ShowWormsStats=1 +# Show misc chart +# Default: a (See also MiscTrackerUrl parameter), Possible codes: ajdfrqwp +ShowMiscStats=ajdfrqwp +ShowScreenSizeStats=1 + + +# In the Origin chart, you have stats on where your hits came from. You can +# includes hits on pages that comes from pages of same sites in this chart. +# Possible values: 0 or 1 +# Default: 0 +# +IncludeInternalLinksInOriginSection=1 + + + +#----------------------------------------------------------------------------- +# PLUGINS +#----------------------------------------------------------------------------- + +# Add here all plugin files you want to load. +# Plugin files must be .pm files stored in 'plugins' directory. +# Uncomment LoadPlugin lines to enable a plugin after checking that perl +# modules required by the plugin are installed. + +# PLUGIN: Tooltips +# REQUIRED MODULES: None +# PARAMETERS: None +# DESCRIPTION: Add tooltips pop-up help boxes to HTML report pages. 
+# NOTE: This will increase HTML report pages size, thus server load and bandwidth. +# +#LoadPlugin="tooltips" + +# PLUGIN: DecodeUTFKeys +# REQUIRED MODULES: Encode and URI::Escape +# PARAMETERS: None +# DESCRIPTION: Allow AWStats to show correctly (in language charset) +# keywords/keyphrases strings even if they were UTF8 coded by the +# referer search engine. +# +#LoadPlugin="decodeutfkeys" + +# PLUGIN: IPv6 +# PARAMETERS: None +# REQUIRED MODULES: Net::IP and Net::DNS +# DESCRIPTION: This plugin gives AWStats capability to make reverse DNS +# lookup on IPv6 addresses. +# +#LoadPlugin="ipv6" + +# PLUGIN: HashFiles +# REQUIRED MODULES: Storable +# PARAMETERS: None +# DESCRIPTION: AWStats DNS cache files are read/saved as native hash files. +# This increases DNS cache files loading speed, above all for very large web sites. +# +#LoadPlugin="hashfiles" + + +# PLUGIN: UserInfo +# REQUIRED MODULES: None +# PARAMETERS: None +# DESCRIPTION: Add a text (Firstname, Lastname, Office Department, ...) in +# authenticated user reports for each login value. +# A text file called userinfo.myconfig.txt, with two fields (first is login, +# second is text to show, separated by a tab char) must be created in DirData +# directory. +# +#LoadPlugin="userinfo" + +# PLUGIN: HostInfo +# REQUIRED MODULES: Net::XWhois +# PARAMETERS: None +# DESCRIPTION: Add a column into host chart with a link to open a popup window that shows +# info on host (like whois records). +# +#LoadPlugin="hostinfo" + +# PLUGIN: ClusterInfo +# REQUIRED MODULES: None +# PARAMETERS: None +# DESCRIPTION: Add a text (for example a full hostname) in cluster reports for each cluster +# number. A text file called clusterinfo.myconfig.txt, with two fields (first is +# cluster number, second is text to show) separated by a tab char, must be +# created into DirData directory. +# Note this plugin is useless if ShowClusterStats is set to 0 or if you don't +# use a personalized log format that contains %cluster tag.
+# +#LoadPlugin="clusterinfo" + +# PLUGIN: UrlAliases +# REQUIRED MODULES: None +# PARAMETERS: None +# DESCRIPTION: Add a text (Page title, description...) in URL reports before URL value. +# A text file called urlalias.myconfig.txt, with two fields (first is URL, +# second is text to show, separated by a tab char) must be created into +# DirData directory. +# +#LoadPlugin="urlalias" + +# PLUGIN: TimeHiRes +# REQUIRED MODULES: Time::HiRes (if Perl < 5.8) +# PARAMETERS: None +# DESCRIPTION: Time reported by -showsteps option is in millisecond. For debug purpose. +# +#LoadPlugin="timehires" + +# PLUGIN: TimeZone +# REQUIRED MODULES: Time::Local +# PARAMETERS: [timezone offset] +# DESCRIPTION: Allow AWStats to adjust time stamps for a different timezone +# This plugin reduces AWStats speed of 10% !!!!!!! +# LoadPlugin="timezone" +# LoadPlugin="timezone +2" +# LoadPlugin="timezone CET" +# +#LoadPlugin="timezone +2" + +# PLUGIN: Rawlog +# REQUIRED MODULES: None +# PARAMETERS: None +# DESCRIPTION: This plugin adds a form in AWStats main page to allow users to see raw +# content of current log files. A filter is also available. +# +#LoadPlugin="rawlog" + +# PLUGIN: GraphApplet +# REQUIRED MODULES: None +# PARAMETERS: [CSS classes to override] +# DESCRIPTION: Supported charts are built by a 3D graphic applet. +# +#LoadPlugin="graphapplet /awstatsclasses" # EXPERIMENTAL FEATURE + +# PLUGIN: GraphGoogleChartAPI +# REQUIRED MODULES: None +# PARAMETERS: None +# DESCRIPTION: Replaces the standard charts with free Google API generated images +# in HTML reports. If country data is available and more than one country has hits, +# a map will be generated using Google Visualizations. +# Note: The machine where reports are displayed must have Internet access for the +# charts to be generated. The only data sent to Google includes the statistic numbers, +# legend names and country names. +# Warning: This plugin is not compatible with option BuildReportFormat=xhtml. 
+# +#LoadPlugin="graphgooglechartapi" + +# PLUGIN: GeoIPfree +# REQUIRED MODULES: Geo::IPfree version 0.2+ (from Graciliano M.P.) +# PARAMETERS: None +# DESCRIPTION: Country chart is built from an Internet IP-Country database. +# This plugin is useless for intranet only log files. +# Note: You must choose between using this plugin (need Perl Geo::IPfree +# module, database is free but not up to date) or the GeoIP plugin (need +# Perl Geo::IP module from Maxmind, database is also free and up to date). +# Note: Activestate provide a corrupted version of Geo::IPfree 0.2 Perl +# module, so install it from elsewhere (from www.cpan.org for example). +# This plugin reduces AWStats speed by up to 10% ! +# +#LoadPlugin="geoipfree" + +# MAXMIND GEO IP MODULES: Please see documentation for notes on all Maxmind modules + +# PLUGIN: GeoIP +# REQUIRED MODULES: Geo::IP or Geo::IP::PurePerl (from Maxmind) +# PARAMETERS: [GEOIP_STANDARD | GEOIP_MEMORY_CACHE] [/pathto/geoip.dat] +# DESCRIPTION: Builds a country chart and adds an entry to the hosts +# table with country name +# Replace spaces in the path of geoip data file with string "%20". +# +LoadPlugin="geoip GEOIP_STANDARD /home/ldestailleur/git/awstats/test/maxmind/GeoIP.dat" + +# PLUGIN: GeoIP_City_Maxmind +# REQUIRED MODULES: Geo::IP or Geo::IP::PurePerl (from Maxmind) +# PARAMETERS: [GEOIP_STANDARD | GEOIP_MEMORY_CACHE] [/pathto/GeoIPCity.dat] +# DESCRIPTION: This plugin adds a column under the hosts field and tracks the pageviews +# and hits by city including regions. +# Replace spaces in the path of geoip data file with string "%20". 
+# +LoadPlugin="geoip_city_maxmind GEOIP_STANDARD /home/ldestailleur/git/awstats/test/maxmind/GeoIPCity-532.dat" + +# PLUGIN: GeoIP_ASN_Maxmind +# REQUIRED MODULES: Geo::IP or Geo::IP::PurePerl (from Maxmind) +# PARAMETERS: [GEOIP_STANDARD | GEOIP_MEMORY_CACHE] [/pathto/GeoIPASN.dat[+/pathto/override.txt][+http://linktoASlookup]] +# DESCRIPTION: This plugin adds a chart of AS numbers where the host IP address is registered. +# This plugin can display some ISP information if included in the database. You can also provide +# a link that will be used to lookup additional registration data. Put the link at the end of +# the parameter string and the report page will include the link with the full AS number at the end. +# Replace spaces in the path of geoip data file with string "%20". +# +#LoadPlugin="geoip_asn_maxmind GEOIP_STANDARD /home/ldestailleur/git/awstats/test/maxmind/GeoIP.dat+http://enc.com.au/itools/aut-num.php?autnum=" + +# PLUGIN: GeoIP_Region_Maxmind +# REQUIRED MODULES: Geo::IP or Geo::IP::PurePerl (from Maxmind) +# PARAMETERS: [GEOIP_STANDARD | GEOIP_MEMORY_CACHE] [/pathto/GeoIPRegion.dat] +# DESCRIPTION:This plugin adds a chart of hits by regions. Only regions for US and +# Canada can be detected. +# Replace spaces in the path of geoip data file with string "%20". +# +LoadPlugin="geoip_region_maxmind GEOIP_STANDARD /home/ldestailleur/git/awstats/test/maxmind/GeoIPRegion-515.dat" + +# PLUGIN: GeoIP_ISP_Maxmind +# REQUIRED MODULES: Geo::IP or Geo::IP::PurePerl (from Maxmind) +# PARAMETERS: [GEOIP_STANDARD | GEOIP_MEMORY_CACHE] [/pathto/GeoIPISP.dat] +# DESCRIPTION: This plugin adds a chart of hits by ISP. +# Replace spaces in the path of geoip data file with string "%20". 
+# +#LoadPlugin="geoip_isp_maxmind GEOIP_STANDARD /home/ldestailleur/git/awstats/test/maxmind/GeoIPISP-122.dat" + +# PLUGIN: GeoIP_Org_Maxmind +# REQUIRED MODULES: Geo::IP or Geo::IP::PurePerl (from Maxmind) +# PARAMETERS: [GEOIP_STANDARD | GEOIP_MEMORY_CACHE] [/pathto/GeoIPOrg.dat] +# DESCRIPTION: This plugin adds a chart of hits by Organization name +# Replace spaces in the path of geoip data file with string "%20". +# +LoadPlugin="geoip_org_maxmind GEOIP_STANDARD /home/ldestailleur/git/awstats/test/maxmind/GeoIPOrg-111.dat" + + + +#----------------------------------------------------------------------------- +# EXTRA SECTION +#----------------------------------------------------------------------------- + +# WARNING: Extra sections are an experimental feature, not stable yet !!! + +# You can define your own charts, you choose here what are rows and columns +# keys. This feature is particularly useful for marketing purpose, tracking +# products orders for example. +# For this, edit all parameters of Extra section. Each set of parameter is a +# different chart. For several charts, duplicate section changing the number. +# Note that each Extra section reduces AWStats speed by 10%. +# +# WARNING: A wrong setup of Extra section can result in too large arrays +# that will consume all your memory, making AWStats unusable after several +# updates, so be sure to set it up correctly. +# In most cases, you don't need this feature. +# +# ExtraSectionNameX is title of your personalized chart. +# ExtraSectionConditionX are conditions on URL and/or QUERY_STRING and/or +# REFERER you can use to count or not the hit. Use "|" for "OR". +# ExtraSectionFirstColumnTitleX is the first column title of the chart. +# ExtraSectionFirstColumnValuesX is a Regex string to tell AWStats how to +# extract the value used for first column. Each different value found will +# be a different row. Be sure that list of different values is "limited" to +# avoid "not enough memory" problems !
+# ExtraSectionStatTypesX are things you want to count. You can use standard +# code letters (P for pages,H for hits,B for bandwidth,L for last access). +# MaxNbOfExtraX is maximum number of rows shown in chart. +# MinHitExtraX is minimum number of hits required to be shown in chart. +# + +# Example to report the 20 products the most ordered by "order.cgi" script +ExtraSectionName1="Product orders" +ExtraSectionCondition1="URL,\/cgi\-bin\/order\.cgi|URL,\/cgi-bin2\/order\.cgi" +ExtraSectionFirstColumnTitle1="Product ID" +ExtraSectionFirstColumnValues1="QUERY_STRING,productId=([^&]+)" +ExtraSectionStatTypes1=PL +MaxNbOfExtra1=20 +MinHitExtra1=1 + + +ExtraSectionName2="Redirect" +ExtraSectionCondition2="URL,\/cgi\-bin\/awredir\.pl" +ExtraSectionFirstColumnTitle2="Url" +ExtraSectionFirstColumnValues2="QUERY_STRING,url=([^&]+)" +ExtraSectionStatTypes2=HL +MaxNbOfExtra2=20 +MinHitExtra2=1 diff --git a/test/test.pl b/test/test.pl index 9655d1a1..5251bd48 100755 --- a/test/test.pl +++ b/test/test.pl @@ -13,38 +13,42 @@ $PERL="perl"; @TESTLIST=( "testglobal", -"testlogins", -"testworms", -"testipv6", -"testdnsdone", -"testextra", -"testgeoip", -"testgeoip_region_maxmind", -"testgeoip_city_maxmind", +"testsmall", +"testnginx", +"testtime5", +#"testlogins", +#"testworms", +#"testipv6", +#"testdnsdone", +#"testextra", +#"testgeoip", +#"testgeoip_region_maxmind", +#"testgeoip_city_maxmind", "testgeoip_isp_maxmind", -"testgeoip_org_maxmind", -"testrobot", -"benchmark", -"testmoddeflate","testmodgzip","testmodgzip2","testmodgzip3", -"testurlwithquery", -"testwindowsmediaserver","testwindowsmediaserver9","testrealmediaserver","testdarwinserver", -"testsquidextended", -"testisa1", -"testisa2", -"testlotus", -"testlotus65", -"testwebstar", -"testzope", -"testcluster", -"testoracle9ias", -"testproftp","testproftp2","testvsftpd", -"testskipfiles", -"testvirtualhosts", -"testsendmail", -"testpostfix", -"testpostfix1", -"testpostfix4", -"testexchange"); +#"testgeoip_org_maxmind", 
+#"testrobot", +#"benchmark", +#"testmoddeflate","testmodgzip","testmodgzip2","testmodgzip3", +#"testurlwithquery", +#"testwindowsmediaserver","testwindowsmediaserver9","testrealmediaserver","testdarwinserver", +#"testsquidextended", +#"testisa1", +#"testisa2", +#"testlotus", +#"testlotus65", +#"testwebstar", +#"testzope", +#"testcluster", +#"testoracle9ias", +#"testproftp","testproftp2","testvsftpd", +#"testskipfiles", +#"testvirtualhosts", +#"testsendmail", +#"testpostfix", +#"testpostfix1", +#"testpostfix4", +#"testexchange" +); #@TESTLIST=("testglobal","testsmall","testtime5"); #@TESTLIST=("testlogins"); @@ -88,7 +92,7 @@ while(1==1) { print "Choose test to execute...\n"; - sprintf("$02i %s",0,"All"); + sprintf("$2i %s", 0, "All"); my $i=1; foreach my $key (@TESTLIST) { print sprintf("%02i) %s\n",$i,$key); @@ -106,7 +110,7 @@ while(1==1) else { push @chosen, $TESTLIST[$bidon-1]; } # Option output - print "Choose output option (browserdetail, osdetail, ...)\n"; + print "Choose output option ('', 'browserdetail', 'osdetail', ...)\n"; $bidon=''; print "Your choice : "; $bidon=;