From: eldy <> Date: Sat, 1 Dec 2001 19:56:25 +0000 (+0000) Subject: Increase speed by 21% with new robot detection algorithm. X-Git-Tag: AWSTATS_1_0~14 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4cd005af1fc8caf8b6ce6eb4afa48e76571e859a;p=thirdparty%2FAWStats.git Increase speed by 21% with new robot detection algorithm. --- diff --git a/wwwroot/cgi-bin/db/robots.pl b/wwwroot/cgi-bin/db/robots.pl index 31c2315b..e565b417 100644 --- a/wwwroot/cgi-bin/db/robots.pl +++ b/wwwroot/cgi-bin/db/robots.pl @@ -1,280 +1,351 @@ # AWSTATS ROBOTS DATABASE #------------------------ -# Last update: 2001-10-20 +# Last update: 2001-12-02 + + +# List of active major robots +@RobotArrayID_major = ( +"googlebot", +"tcl", +"ia_archiver", +"scooter", +"fast-Webcrawler", +"bjaaland", +"echo", +"jeeves", +"voila", +"voyager", +"mercator", +"linkWalker", +"slurp", +"wisenutbot", +"gulliver", +"myweb", +"wget", +"architextspider", +"webbase", +"muscatferret", +"lycos", +"nomad", +"appie", +"weblayers", +"moget", +"unlost_web_crawler", +"antibot", +"harvest", +"ferret", +"jennybot", +"peternews", +"htdig" +); + +@RobotArrayID_generic = ( +# Generic robot +"robot" +); + # Robot name list ("os id","os clear text") #------------------------------------------------------- -# Main list of robots (found at http://info.webcrawler.com/mak/projects/robots/active.html) -# This command show how to generate tab list from this file: cat robotslist.txt | sed 's/:/ /' | awk ' /robot-id/ { name=tolower($2); } /robot-name/ { print "\""name"\", \""$0"\"," } ' | sed 's/robot-name *//g' > file +%RobotHashIDLib = ( +# Robots found at http://www.robotstxt.org/wc/active/all.txt # Rem: To avoid bad detection, some robots id were removed from this list: # - Robots with ID of 2 letters only # - Robot called "webs" # Rem: directhit is changed into direct_hit (its real id) # Rem: calif is changed into calif[^r] to avoid confusion between tiscalifreenet browser -%RobotHashIDLib = ( -"acme.spider", "Acme.Spider", -"ahoythehomepagefinder", "Ahoy! The Homepage Finder", -"alkaline", "Alkaline", -"appie", "Walhello appie", -"arachnophilia", "Arachnophilia", -"architext", "ArchitextSpider", -"aretha", "Aretha", -"ariadne", "ARIADNE", -"aspider", "ASpider (Associative Spider)", -"atn.txt", "ATN Worldwide", -"atomz", "Atomz.com Search Robot", -"auresys", "AURESYS", -"backrub", "BackRub", -"bigbrother", "Big Brother", -"bjaaland", "Bjaaland", -"blackwidow", "BlackWidow", -"blindekuh", "Die Blinde Kuh", -"bloodhound", "Bloodhound", -"brightnet", "bright.net caching robot", -"bspider", "BSpider", -"cactvschemistryspider", "CACTVS Chemistry Spider", -"calif[^r]", "Calif", -"cassandra", "Cassandra", -"cgireader", "Digimarc Marcspider/CGI", -"checkbot", "Checkbot", -"churl", "churl", -"cmc", "CMC/0.01", -"collective", "Collective", -"combine", "Combine System", -"conceptbot", "Conceptbot", -"core", "Web Core / Roots", -"cshkust", "CS-HKUST WISE: WWW Index and Search Engine", -"cusco", "Cusco", -"cyberspyder", "CyberSpyder Link Test", -"deweb", "DeWeb(c) Katalog/Index", -"dienstspider", "DienstSpider", -"diibot", "Digital Integrity Robot", -"direct_hit", "Direct Hit Grabber", -"dnabot", "DNAbot", -"download_express", "DownLoad Express", -"dragonbot", "DragonBot", -"dwcp", "DWCP (Dridus' Web Cataloging Project)", -"ebiness", "EbiNess", -"eit", "EIT Link Verifier Robot", -"emacs", "Emacs-w3 Search Engine", -"emcspider", "ananzi", -"esther", "Esther", -"evliyacelebi", "Evliya Celebi", -"fdse", "Fluid Dynamics Search Engine robot", -"felix", " Felix IDE", -"ferret", "Wild Ferret Web Hopper #1, #2, #3", -"fetchrover", "FetchRover", -"fido", "fido", -"finnish", "Hämähäkki", -"fireball", "KIT-Fireball", -"fish", "Fish search", -"fouineur", "Fouineur", -"francoroute", "Robot Francoroute", -"freecrawl", "Freecrawl", -"funnelweb", "FunnelWeb", -"gazz", "gazz", -"gcreep", "GCreep", -"getbot", "GetBot", -"geturl", "GetURL", -"golem", "Golem", -"googlebot", "Googlebot", -"grapnel", "Grapnel/0.01 Experiment", -"griffon", "Griffon", -"gromit", "Gromit", -"gulliver", "Northern Light Gulliver", -"hambot", "HamBot", -"harvest", "Harvest", -"havindex", "havIndex", -"hometown", "Hometown Spider Pro", -"wired-digital", "Wired Digital", -"htdig", "ht://Dig", -"htmlgobble", "HTMLgobble", -"hyperdecontextualizer", "Hyper-Decontextualizer", -"ibm", "IBM_Planetwide", -"iconoclast", "Popular Iconoclast", -"ilse", "Ingrid", -"imagelock", "Imagelock ", -"incywincy", "IncyWincy", -"informant", "Informant", -"infoseek", "InfoSeek Robot 1.0", -"infoseeksidewinder", "Infoseek Sidewinder", -"infospider", "InfoSpiders", -"inspectorwww", "Inspector Web", -"intelliagent", "IntelliAgent", -"iron33", "Iron33", -"israelisearch", "Israeli-search", -"javabee", "JavaBee", -"jcrawler", "JCrawler", -"jeeves", "Jeeves", -"jobot", "Jobot", -"joebot", "JoeBot", -"jubii", "The Jubii Indexing Robot", -"jumpstation", "JumpStation", -"katipo", "Katipo", -"kdd", "KDD-Explorer", -"kilroy", "Kilroy", -"ko_yappo_robot", "KO_Yappo_Robot", -"labelgrabber.txt", "LabelGrabber", -"larbin", "larbin", -"legs", "legs", -"linkscan", "LinkScan", -"linkwalker", "LinkWalker", -"lockon", "Lockon", -"logo_gif", "logo.gif Crawler", -"lycos", "Lycos", -"macworm", "Mac WWWWorm", -"magpie", "Magpie", -"mediafox", "MediaFox", -"merzscope", "MerzScope", -"meshexplorer", "NEC-MeshExplorer", -"mindcrawler", "MindCrawler", -"moget", "moget", -"momspider", "MOMspider", -"monster", "Monster", -"motor", "Motor", -"muscatferret", "Muscat Ferret", -"mwdsearch", "Mwd.Search", -"myweb", "Internet Shinchakubin", -"netcarta", "NetCarta WebMap Engine", -"netmechanic", "NetMechanic", -"netscoop", "NetScoop", -"newscan-online", "newscan-online", -"nhse", "NHSE Web Forager", -"nomad", "Nomad", -"northstar", "The NorthStar Robot", -"nzexplorer", "nzexplorer", -"occam", "Occam", -"octopus", "HKU WWW Octopus", -"orb_search", "Orb Search", -"packrat", "Pack Rat", -"pageboy", "PageBoy", -"parasite", "ParaSite", -"patric", "Patric", -"perignator", "The Peregrinator", -"perlcrawler", "PerlCrawler 1.0", -"phantom", "Phantom", -"piltdownman", "PiltdownMan", -"pioneer", "Pioneer", -"pitkow", "html_analyzer", -"pjspider", "Portal Juice Spider", -"pka", "PGP Key Agent", -"plumtreewebaccessor", "PlumtreeWebAccessor", -"poppi", "Poppi", -"portalb", "PortalB Spider", -"puu", "GetterroboPlus Puu", -"python", "The Python Robot", -"raven", "Raven Search", -"rbse", "RBSE Spider", -"resumerobot", "Resume Robot", -"rhcs", "RoadHouse Crawling System", -"roadrunner", "Road Runner: The ImageScape Robot", -"robbie", "Robbie the Robot", -"robi", "ComputingSite Robi/1.0", -"roverbot", "Roverbot", -"safetynetrobot", "SafetyNet Robot", -"scooter", "Scooter", -"search_au", "Search.Aus-AU.COM", -"searchprocess", "SearchProcess", -"senrigan", "Senrigan", -"sgscout", "SG-Scout", -"shaggy", "ShagSeeker", -"shaihulud", "Shai'Hulud", -"sift", "Sift", -"simbot", "Simmany Robot Ver1.0", -"site-valet", "Site Valet", -"sitegrabber", "Open Text Index Robot", -"sitetech", "SiteTech-Rover", -"slurp", "Inktomi Slurp", -"smartspider", "Smart Spider", -"snooper", "Snooper", -"solbot", "Solbot", -"spanner", "Spanner", -"speedy", "Speedy Spider", -"spider_monkey", "spider_monkey", -"spiderbot", "SpiderBot", -"spiderman", "SpiderMan", -"spry", "Spry Wizard Robot", -"ssearcher", "Site Searcher", -"suke", "Suke", -"sven", "Sven", -"tach_bw", "TACH Black Widow", -"tarantula", "Tarantula", -"tarspider", "tarspider", -"tcl", "Tcl W3 Robot", -"techbot", "TechBOT", -"templeton", "Templeton", -"titin", "TitIn", -"titan", "TITAN", -"tkwww", "The TkWWW Robot", -"tlspider", "TLSpider", -"ucsd", "UCSD Crawl", -"udmsearch", "UdmSearch", -"urlck", "URL Check", -"valkyrie", "Valkyrie", -"victoria", "Victoria", -"visionsearch", "vision-search", -"voyager", "Voyager", -"vwbot", "VWbot", -"w3index", "The NWI Robot", -"w3m2", "W3M2", -"wanderer", "the World Wide Web Wanderer", -"webbandit", "WebBandit Web Spider", -"webcatcher", "WebCatcher", -"webcopy", "WebCopy", -"webfetcher", "webfetcher", -"webfoot", "The Webfoot Robot", -"weblayers", "Weblayers", -"weblinker", "WebLinker", -"webmirror", "WebMirror", -"webmoose", "The Web Moose", -"webquest", "WebQuest", -"webreader", "Digimarc MarcSpider", -"webreaper", "WebReaper", -"websnarf", "Websnarf", -"webspider", "WebSpider", -"webvac", "WebVac", -"webwalk", "webwalk", -"webwalker", "WebWalker", -"webwatch", "WebWatch", -"wget", "Wget", -"whowhere", "WhoWhere Robot", -"wmir", "w3mir", -"wolp", "WebStolperer", -"wombat", "The Web Wombat ", -"worm", "The World Wide Web Worm", -"wwwc", "WWWC Ver 0.2.5", -"wz101", "WebZinger", -"xget", "XGET", -"nederland.zoek", "Nederland.zoek", - -# Not declared robots -"antibot", "Antibot (Not referenced robot)", -"cscrawler","CsCrawler (Not referenced robot)", -"daviesbot", "DaviesBot (Not referenced robot)", -"ezresult", "Ezresult (Not referenced robot)", -"fast-webcrawler", "Fast-Webcrawler (Not referenced robot)", -"gnodspider","GNOD Spider (Not referenced robot)", -"jennybot", "JennyBot (Not referenced robot)", -"justview", "JustView (Not referenced robot)", -"mercator", "Mercator (Not referenced robot)", -"perman surfer", "Perman surfer (Not referenced robot)", -"redalert", "Red Alert (Not referenced robot)", -"shoutcast","Shoutcast Directory Service (Not referenced robot)", -"unlost_web_crawler", "Unlost_Web_Crawler (Not referenced robot)", -"webbase", "WebBase (Not referenced robot)", -"wisenutbot","WISENutbot (Not referenced robot)", -"yandex", "Yandex bot (Not referenced robot)", -# Supposed to be robots -"boris", "Boris (Not referenced robot)", -"digout4u", "digout4u (Not referenced robot)", -"echo", "EchO! (Not referenced robot)", -"ia_archiver", "ia_archiver (Not referenced robot)", -"ultraseek", "Ultraseek (Not referenced robot)", -"voila", "Voila (Not referenced robot)", -"webcompass", "webcompass (Not referenced robot)", -# Generic ID -"robot", "Unknown robot (Not referenced robot)" +"acme.spider","Acme.Spider", +"ahoythehomepagefinder","Ahoy! The Homepage Finder", +"alkaline","Alkaline", +"appie","Walhello appie", +"arachnophilia","Arachnophilia", +"architext","ArchitextSpider", +"aretha","Aretha", +"ariadne","ARIADNE", +"arks","arks", +"aspider","ASpider (Associative Spider)", +"atn.txt","ATN Worldwide", +"atomz","Atomz.com Search Robot", +"auresys","AURESYS", +"backrub","BackRub", +"bigbrother","Big Brother", +"bjaaland","Bjaaland", +"blackwidow","BlackWidow", +"blindekuh","Die Blinde Kuh", +"bloodhound","Bloodhound", +"brightnet","bright.net caching robot", +"bspider","BSpider", +"cactvschemistryspider","CACTVS Chemistry Spider", +"calif","Calif", +"cassandra","Cassandra", +"cgireader","Digimarc Marcspider/CGI", +"checkbot","Checkbot", +"churl","churl", +"cmc","CMC/0.01", +"collective","Collective", +"combine","Combine System", +"conceptbot","Conceptbot", +"coolbot","CoolBot", +"core","Web Core / Roots", +"cosmos","XYLEME Robot", +"cruiser","Internet Cruiser Robot", +"cusco","Cusco", +"cyberspyder","CyberSpyder Link Test", +"deweb","DeWeb(c) Katalog/Index", +"dienstspider","DienstSpider", +"digger","Digger", +"diibot","Digital Integrity Robot", +"directhit","Direct Hit Grabber", +"dnabot","DNAbot", +"download_express","DownLoad Express", +"dragonbot","DragonBot", +"dwcp","DWCP (Dridus' Web Cataloging Project)", +"e-collector","e-collector", +"ebiness","EbiNess", +"eit","EIT Link Verifier Robot", +"elfinbot","ELFINBOT", +"emacs","Emacs-w3 Search Engine", +"emcspider","ananzi", +"esther","Esther", +"evliyacelebi","Evliya Celebi", +"nzexplorer","nzexplorer", +"fdse","Fluid Dynamics Search Engine robot", +"felix","Felix IDE", +"ferret","Wild Ferret Web Hopper #1, #2, #3", +"fetchrover","FetchRover", +"fido","fido", +"finnish","Hämähäkki", +"fireball","KIT-Fireball", +"fish","Fish search", +"fouineur","Fouineur", +"francoroute","Robot Francoroute", +"freecrawl","Freecrawl", +"funnelweb","FunnelWeb", +"gama","gammaSpider, FocusedCrawler", +"gazz","gazz", +"gcreep","GCreep", +"getbot","GetBot", +"geturl","GetURL", +"golem","Golem", +"googlebot","Googlebot", +"grapnel","Grapnel/0.01 Experiment", +"griffon","Griffon", +"gromit","Gromit", +"gulliver","Northern Light Gulliver", +"hambot","HamBot", +"harvest","Harvest", +"havindex","havIndex", +"hometown","Hometown Spider Pro", +"wired-digital","Wired Digital", +"htdig","ht://Dig", +"htmlgobble","HTMLgobble", +"hyperdecontextualizer","Hyper-Decontextualizer", +"iajabot","iajaBot", +"ibm","IBM_Planetwide", +"iconoclast","Popular Iconoclast", +"ilse","Ingrid", +"imagelock","Imagelock", +"incywincy","IncyWincy", +"informant","Informant", +"infoseek","InfoSeek Robot 1.0", +"infoseeksidewinder","Infoseek Sidewinder", +"infospider","InfoSpiders", +"inspectorwww","Inspector Web", +"intelliagent","IntelliAgent", +"irobot","I, Robot", +"iron33","Iron33", +"israelisearch","Israeli-search", +"javabee","JavaBee", +"jbot","JBot Java Web Robot", +"jcrawler","JCrawler", +"jeeves","Jeeves", +"jobo","JoBo Java Web Robot", +"jobot","Jobot", +"joebot","JoeBot", +"jubii","The Jubii Indexing Robot", +"jumpstation","JumpStation", +"katipo","Katipo", +"kdd","KDD-Explorer", +"kilroy","Kilroy", +"ko_yappo_robot","KO_Yappo_Robot", +"labelgrabber.txt","LabelGrabber", +"larbin","larbin", +"legs","legs", +"linkidator","Link Validator", +"linkscan","LinkScan", +"linkwalker","LinkWalker", +"lockon","Lockon", +"logo_gif","logo.gif Crawler", +"lycos","Lycos", +"macworm","Mac WWWWorm", +"magpie","Magpie", +"marvin","marvin/infoseek", +"mattie","Mattie", +"mediafox","MediaFox", +"merzscope","MerzScope", +"meshexplorer","NEC-MeshExplorer", +"mindcrawler","MindCrawler", +"moget","moget", +"momspider","MOMspider", +"monster","Monster", +"motor","Motor", +"muscatferret","Muscat Ferret", +"mwdsearch","Mwd.Search", +"myweb","Internet Shinchakubin", +"netcarta","NetCarta WebMap Engine", +"netmechanic","NetMechanic", +"netscoop","NetScoop", +"newscan-online","newscan-online", +"nhse","NHSE Web Forager", +"nomad","Nomad", +"northstar","The NorthStar Robot", +"occam","Occam", +"octopus","HKU WWW Octopus", +"openfind","Openfind data gatherer", +"orb_search","Orb Search", +"packrat","Pack Rat", +"pageboy","PageBoy", +"parasite","ParaSite", +"patric","Patric", +"pegasus","pegasus", +"perignator","The Peregrinator", +"perlcrawler","PerlCrawler 1.0", +"phantom","Phantom", +"piltdownman","PiltdownMan", +"pimptrain","Pimptrain.com's robot", +"pioneer","Pioneer", +"pitkow","html_analyzer", +"pjspider","Portal Juice Spider", +"pka","PGP Key Agent", +"plumtreewebaccessor","PlumtreeWebAccessor", +"poppi","Poppi", +"portalb","PortalB Spider", +"puu","GetterroboPlus Puu", +"python","The Python Robot", +"raven","Raven Search", +"rbse","RBSE Spider", +"resumerobot","Resume Robot", +"rhcs","RoadHouse Crawling System", +"roadrunner","Road Runner: The ImageScape Robot", +"robbie","Robbie the Robot", +"robi","ComputingSite Robi/1.0", +"robofox","RoboFox", +"robozilla","Robozilla", +"roverbot","Roverbot", +"rules","RuLeS", +"safetynetrobot","SafetyNet Robot", +"scooter","Scooter", +"search_au","Search.Aus-AU.COM", +"searchprocess","SearchProcess", +"senrigan","Senrigan", +"sgscout","SG-Scout", +"shaggy","ShagSeeker", +"shaihulud","Shai'Hulud", +"sift","Sift", +"simbot","Simmany Robot Ver1.0", +"site-valet","Site Valet", +"sitegrabber","Open Text Index Robot", +"sitetech","SiteTech-Rover", +"slcrawler","SLCrawler", +"slurp","Inktomi Slurp", +"smartspider","Smart Spider", +"snooper","Snooper", +"solbot","Solbot", +"spanner","Spanner", +"speedy","Speedy Spider", +"spider_monkey","spider_monkey", +"spiderbot","SpiderBot", +"spiderline","Spiderline Crawler", +"spiderman","SpiderMan", +"spiderview","SpiderView(tm)", +"spry","Spry Wizard Robot", +"ssearcher","Site Searcher", +"suke","Suke", +"suntek","suntek search engine", +"sven","Sven", +"tach_bw","TACH Black Widow", +"tarantula","Tarantula", +"tarspider","tarspider", +"tcl","Tcl W3 Robot", +"techbot","TechBOT", +"templeton","Templeton", +"teoma_agent1","TeomaTechnologies", +"titin","TitIn", +"titan","TITAN", +"tkwww","The TkWWW Robot", +"tlspider","TLSpider", +"ucsd","UCSD Crawl", +"udmsearch","UdmSearch", +"urlck","URL Check", +"valkyrie","Valkyrie", +"victoria","Victoria", +"visionsearch","vision-search", +"voyager","Voyager", +"vwbot","VWbot", +"w3index","The NWI Robot", +"w3m2","W3M2", +"wallpaper","WallPaper", +"wanderer","the World Wide Web Wanderer", +"wapspider","w@pSpider by wap4.com", +"webbandit","WebBandit Web Spider", +"webcatcher","WebCatcher", +"webcopy","WebCopy", +"webfetcher","webfetcher", +"webfoot","The Webfoot Robot", +"weblayers","weblayers", +"weblinker","WebLinker", +"webmirror","WebMirror", +"webmoose","The Web Moose", +"webquest","WebQuest", +"webreader","Digimarc MarcSpider", +"webreaper","WebReaper", +"websnarf","Websnarf", +"webspider","WebSpider", +"webvac","WebVac", +"webwalk","webwalk", +"webwalker","WebWalker", +"webwatch","WebWatch", +"wget","Wget", +"whatuseek","whatUseek Winona", +"whowhere","WhoWhere Robot", +"wmir","w3mir", +"wolp","WebStolperer", +"wombat","The Web Wombat", +"worm","The World Wide Web Worm", +"wwwc","WWWC Ver 0.2.5", +"wz101","WebZinger", +"xget","XGET", +"nederland.zoek","Nederland.zoek", +# Other robots reported by users +"antibot", "Antibot", +"boris", "Boris", +"cscrawler","CsCrawler", +"daviesbot", "DaviesBot", +"digout4u", "digout4u", +"echo", "EchO!", +"ezresult", "Ezresult", +"fast-webcrawler", "Fast-Webcrawler", +"gnodspider","GNOD Spider", +"ia_archiver", "ia_archiver", +"jennybot", "JennyBot", +"justview", "JustView", +"mercator", "Mercator", +"perman", "Perman surfer", +"peternews", "Peternews", +"redalert", "Red Alert", +"shoutcast","Shoutcast Directory Service", +"ultraseek", "Ultraseek", +"unlost_web_crawler", "Unlost_Web_Crawler", +"voila", "Voila", +"webbase", "WebBase", +"webcompass", "webcompass", +"wisenutbot","WISENutbot", +"yandex", "Yandex bot", +# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html +# Generic root ID +"robot", "Unknown robot" ); + 1;