From: eldy <> Date: Tue, 8 Oct 2002 04:00:52 +0000 (+0000) Subject: Updated database X-Git-Tag: AWSTATS_5_1_BETA~52 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f62b3c5c3371a167dbdb88ea1dd9a40471fa3260;p=thirdparty%2FAWStats.git Updated database --- diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 245702dc..42efe11d 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -306,11 +306,13 @@ "xget", # Other robots reported by users "awbot", +"baiduspider", "bobby", "boris", "bumblebee", "cscrawler", "daviesbot", +"exactseek", "ezresult", "gigabot", "gnodspider", @@ -318,6 +320,7 @@ "justview", "linkbot", "linkchecker", +"msiecrawler", "nederland.zoek", "perman", "pompos", @@ -325,9 +328,11 @@ "redalert", "shoutcast", "slysearch", +"turnitinbot", "ultraseek", "webcompass", -"yandex" +"yandex", +"zyborg" ); @RobotsSearchIDOrder_list3 = ( # Generic robot @@ -415,7 +420,7 @@ "getbot","GetBot", "geturl","GetURL", "golem","Golem", -"googlebot","Googlebot", +"googlebot","Googlebot (Google)", "grapnel","Grapnel/0.01 Experiment", "griffon","Griffon", "gromit","Gromit", @@ -522,7 +527,7 @@ "roverbot","Roverbot", "rules","RuLeS", "safetynetrobot","SafetyNet Robot", -"scooter","Scooter", +"scooter","Scooter (AltaVista)", "search_au","Search.Aus-AU.COM", "searchprocess","SearchProcess", "senrigan","Senrigan", @@ -609,6 +614,7 @@ # Other robots reported by users "antibot", "Antibot", "awbot", "AWBot", +"baiduspider","BaiDuSpider", "bobby", "Bobby", "boris", "Boris", "bumblebee", "Bumblebee (relevare.com)", @@ -616,17 +622,19 @@ "daviesbot", "DaviesBot", "digout4u", "Digout4u", "echo", "EchO!", +"exactseek","ExactSeek Crawler", "ezresult", "Ezresult", -"fast-webcrawler", "Fast-Webcrawler", +"fast-webcrawler", "Fast-Webcrawler (AllTheWeb)", "gigabot","GigaBot", "gnodspider","GNOD Spider", -"ia_archiver", "IA Archiver", +"ia_archiver", "Alexa (IA Archiver)", "internetseer", "InternetSeer", "jennybot", "JennyBot", "justview", "JustView", "linkbot","LinkBot", "linkchecker","LinkChecker", "mercator", "Mercator", +"msiecrawler","MSIECrawler", "perman", "Perman surfer", "petersnews", "Petersnews", "pompos","Pompos", @@ -634,13 +642,15 @@ "redalert", "Red Alert", "shoutcast","Shoutcast Directory Service", "slysearch","SlySearch", +"turnitinbot","Turn It In", "ultraseek", "Ultraseek", -"unlost_web_crawler", "Unlost_Web_Crawler", +"unlost_web_crawler", "Unlost Web Crawler", "voila", "Voila", "webbase", "WebBase", "webcompass", "webcompass", -"wisenutbot","WISENutbot", +"wisenutbot","WISENutbot (Looksmart)", "yandex", "Yandex bot", +"zyborg","Zyborg (Looksmart)", # Generic root ID "robot", "Unknown robot (identified by 'robot')", diff --git a/wwwroot/cgi-bin/lib/search_engines.pm b/wwwroot/cgi-bin/lib/search_engines.pm index 8b8b4821..a64e749e 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pm +++ b/wwwroot/cgi-bin/lib/search_engines.pm @@ -17,7 +17,7 @@ #----------------------------------------------------------------- @SearchEnginesSearchIDOrder=( # Major internationnal search engines -"google\.", +"google\.", # TODO Add 216\.239\.35\.101|216\.239\.37\.101|216\.239\.39\.100|216\.239\.39\.101|216\.239\.51\.100|216\.239\.51\.101|216\.239\.35\.100 "msn\.", "voila\.", "yahoo\.", @@ -27,9 +27,9 @@ "alltheweb\.com", "netscape\.", "dmoz\.org", -"search\.aol\.co", "www\.search\.com", -"overture\.com", # Replace "goto\.com","Goto.com", +"tiscali\.", +"search\.aol\.co", # Minor internationnal search engines "northernlight\.", "hotbot\.", @@ -40,12 +40,15 @@ "(^|\.)go\.com", "euroseek\.", "excite\.", -"lokace\.", +"looksmart\.", "spray\.", -"netfind\.aol\.com", -"recherche\.aol\.fr", "nbci\.com/search", "askjeeves\.", +"atomz\.", +"overture\.com", # Replace "goto\.com","Goto.com", +"teoma\.", +"findarticles\.com", +"infospace\.com", "mamma\.", "dejanews\.", "search\.dogpile\.com", @@ -61,7 +64,7 @@ # Minor finnish search engines "haku\.www\.fi", # Minor french search engines -"nomade\.fr/","ctrouve\.","francite\.","\.lbb\.org","rechercher\.libertysurf\.fr", +"recherche\.aol\.fr","ctrouve\.","francite\.","\.lbb\.org","rechercher\.libertysurf\.fr", # Minor german search engines "fireball\.de","infoseek\.de","suche\.web\.de","meta\.ger", # Minor hungarian search engines @@ -79,48 +82,126 @@ ); +# SearchEnginesKnownUrl +# Search engines known rules to extract keywords from a referrer URL +#------------------------------------------------- +%SearchEnginesKnownUrl=( +# Most common search engines +"alltheweb\.com","q(|uery)=", +"altavista\.","q=", +"dmoz\.org","search=", +"google\.","(p|q)=", +"lycos\.","query=", +"msn\.","q=", +"netscape\.","search=", +"search\.aol\.co","query=", +"search\.terra\.","query=", +"voila\.","kw=", +"www\.search\.com","q=", +"yahoo\.","p=", +# Minor internationnal search engines +"(^|\.)go\.com","qt=", +"askjeeves\.","ask=", +"atomz\.","sp-q=", +"euroseek\.","query=", +"excite\.","search=", +"findarticles\.com","key=", +"go2net\.com","general=", +"hotbot\.","mt=", +"infospace\.com","qkw=", +"kvasir\.", "q=", +"looksmart\.","key=", +"mamma\.","query=", +"metacrawler\.","general=", +"nbci\.com/search","keyword=", +"northernlight\.","qr=", +"overture\.com","keywords=", +"search\.dogpile\.com", "q=", +"spray\.","string=", +"teoma\.","q=", +"virgilio\.it","qs=", +"webcrawler","searchText=", +"wisenut\.com","query=", +# Minor brazilian search engines +"engine\.exe","p1=", "miner\.bol\.com\.br","q=", +# Minor danish search engines +"opasia\.dk","q=", "danielsen\.com","q=", +# Minor dutch search engines +"ilse\.","search_for=", "vindex\.","in=", +# Minor english search engines +"splut\.","pattern=", "ukplus\.", "search=", "mirago\.", "txtsearch=", +# Minor english search engines +"ukindex\.co\.uk", "stext=", "ukdirectory\.","k=", +# Minor finnish search engines +"haku\.www\.fi","w=", +# Minor french search engines +"nomade\.fr/","s=", "francite\.","name=", +# Minor german search engines +"fireball\.de","q=", "infoseek\.de","qt=", "suche\.web\.de","su=", +# Minor hungarian search engines +"heureka\.hu","heureka=", "vizsla\.origo\.hu/katalogus?","q=", "vizsla\.origo\.hu","search=", "lapkereso\.hu","keres.php", "goliat\.hu","KERESES=", "index\.hu","search.php3", "wahoo\.hu","q=", "freeweb\.hu","KERESES=", "search\.internetto\.hu","searchstr=", +# Minor norvegian search engines +"sok\.start\.no", "q=", +# Minor swedish search engines +"evreka\.passagen\.se","q=", +# Minor czech search engines +"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=", "najdi\.to","dotaz=", "redbox.cz","srch=" +); + + +# If no rules are known, this will be used to search keyword parameter +@WordsToExtractSearchUrl= ("ask=","claus=","general=","key=","kw=","keyword=","keywords=","MT=","p=","q=","qr=","qt=","query=","s=","search=","searchText=","string=","su=","txtsearch=","w="); + +# If no rules are known and search in WordsToExtractSearchUrl failed, this will be used to clean URL of not keyword parameters. +@WordsToCleanSearchUrl= ("act=","annuaire=","btng=","categoria=","cfg=","cof=","cou=","count=","cp=","dd=","domain=","dt=","dw=","enc=","exec=","geo=","hc=","height=","hits=","hl=","hq=","hs=","id=","kl=","lang=","loc=","lr=","matchmode=","medor=","message=","meta=","mode=","order=","page=","par=","pays=","pg=","pos=","prg=","qc=","refer=","sa=","safe=","sc=","sort=","src=","start=","style=","stype=","sum=","tag=","temp=","theme=","type=","url=","user=","width=","what=","\\.x=","\\.y=","y=","look="); + + + # SearchEnginesHashIDLib # List of search engines names # "match_string_in_url_that_identify_engine", "search_engine_name", #----------------------------------------------------------------- %SearchEnginesHashIDLib=( # Major internationnal search engines +"alltheweb\.com","AllTheWeb", +"altavista\.","AltaVista", +"dmoz\.org","DMOZ", "google\.","Google", -"msn\.","MSN", -"voila\.", "Voila", -"yahoo\.","Yahoo", "lycos\.","Lycos", -"altavista\.","AltaVista", -"search\.terra\.","Terra", -"alltheweb\.com","AllTheWeb", +"msn\.","MSN", "netscape\.","Netscape", -"dmoz\.org","DMOZ", "search\.aol\.co","AOL", +"search\.terra\.","Terra", +"tiscali\.","Tiscali", +"voila\.", "Voila", "www\.search\.com","Search.com", -"overture\.com","Overture", # Replace "goto\.com","Goto.com", +"yahoo\.","Yahoo", # Minor internationnal search engines -"northernlight\.","NorthernLight", -"hotbot\.","Hotbot", -"kvasir\.","Kvasir", -"webcrawler\.","WebCrawler", -"metacrawler\.","MetaCrawler (Metamoteur)", -"go2net\.com","Go2Net (Metamoteur)", "(^|\.)go\.com","Go.com", +"askjeeves\.","Ask Jeeves", +"atomz\.","Atomz", +"dejanews\.","DejaNews", "euroseek\.","Euroseek", "excite\.","Excite", -"lokace\.", "Lokace", -"spray\.","Spray", -"netfind\.aol\.com","AOL", -"recherche\.aol\.fr","AOL", -"nbci\.com/search","NBCI", -"askjeeves\.","Ask Jeeves", +"findarticles\.com","Find Articles", +"go2net\.com","Go2Net (Metamoteur)", +"hotbot\.","Hotbot", +"infospace\.com","InfoSpace", +"kvasir\.","Kvasir", +"looksmart\.","Looksmart", "mamma\.","Mamma", -"dejanews\.","DejaNews", +"metacrawler\.","MetaCrawler (Metamoteur)", +"nbci\.com/search","NBCI", +"northernlight\.","NorthernLight", +"overture\.com","Overture", # Replace "goto\.com","Goto.com", "search\.dogpile\.com","Dogpile", +"spray\.","Spray", +"teoma\.","Teoma", # Replace "directhit\.com","DirectHit", +"webcrawler\.","WebCrawler", "wisenut\.com","WISENut", # Minor brazilian search engines "engine\.exe","Cade", "miner\.bol\.com\.br","Meta Miner", -# Minor danish search-engines +# Minor danish search-engines "opasia\.dk","Opasia", "danielsen\.com","Thor (danielsen.com)", # Minor dutch search engines "ilse\.","Ilse","vindex\.","Vindex\.nl", @@ -129,7 +210,7 @@ # Minor finnish search engines "haku\.www\.fi","Ihmemaa", # Minor french search engines -"nomade\.fr/","Nomade", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf", +"recherche\.aol\.fr","AOL", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf", # Minor german search engines "fireball\.de","Fireball", "infoseek\.de","Infoseek", "suche\.web\.de","Web.de", "meta\.ger","MetaGer", # Minor hungarian search engines @@ -147,58 +228,4 @@ ); -# SearchEnginesKnownUrl -# Search engines known rules to extract keywords from a referrer URL -#------------------------------------------------- -%SearchEnginesKnownUrl=( -# Most common search engines -"yahoo\.","p=", -"altavista\.","q=", -"msn\.","q=", -"voila\.","kw=", -"lycos\.","query=", -"search\.terra\.","query=", -"google\.","(p|q)=", -"alltheweb\.com","q(|uery)=", -"netscape\.","search=", -"northernlight\.","qr=", -"dmoz\.org","search=", -"search\.aol\.co","query=", -"www\.search\.com","q=", -"kvasir\.", "q=", -# Others -"askjeeves\.","ask=", -"hotbot\.","mt=", -"metacrawler\.","general=", -"go2net\.com","general=", -"(^|\.)go\.com","qt=", -"overture\.com","keywords=", -"euroseek\.","query=", -"excite\.","search=", -"spray\.","string=", -"nbci\.com/search","keyword=", -"mamma\.","query=", -"search\.dogpile\.com", "q=", -"wisenut\.com","query=", -"virgilio\.it","qs=", -"webcrawler","searchText=", -"engine\.exe","p1=", "miner\.bol\.com\.br","q=", # Minor brazilian search engines -"opasia\.dk","q=", "danielsen\.com","q=", # Minor danish search engines -"ilse\.","search_for=", "vindex\.","in=", # Minor dutch search engines -"splut\.","pattern=", "ukplus\.", "search=", "mirago\.", "txtSearch=", # Minor english search engines -"ukindex\.co\.uk", "stext=", "ukdirectory\.","k=", # Minor english search engines -"haku\.www\.fi","w=", # Minor finnish search engines -"nomade\.fr/","s=", "francite\.","name=", # Minor french search engines -"fireball\.de","q=", "infoseek\.de","qt=", "suche\.web\.de","su=", # Minor german search engines -"heureka\.hu","heureka=", "vizsla\.origo\.hu/katalogus?","q=", "vizsla\.origo\.hu","search=", "lapkereso\.hu","keres.php", "goliat\.hu","KERESES=", "index\.hu","search.php3", "wahoo\.hu","q=", "freeweb\.hu","KERESES=", "search\.internetto\.hu","searchstr=", #Minor hungarian search engines -"sok\.start\.no", "q=", # Minor norvegian search engines -"evreka\.passagen\.se","q=", # Minor swedish search engines -"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=", "najdi\.to","dotaz=", "redbox.cz","srch=" # Minor czech search engines -); - -# If no rules are known, this will be used to clean URL of not keyword parameters. -@WordsToCleanSearchUrl= ("act=","annuaire=","btng=","categoria=","cfg=","cof=","cou=","cp=","dd=","domain=","dt=","dw=","enc=","exec=","geo=","hc=","height=","hl=","hq=","hs=","id=","kl=","lang=","loc=","lr=","matchmode=","medor=","message=","meta=","mode=","order=","page=","par=","pays=","pg=","pos=","prg=","qc=","refer=","sa=","safe=","sc=","sort=","src=","start=","style=","stype=","sum=","tag=","temp=","theme=","url=","user=","width=","what=","\\.x=","\\.y=","y=","look="); -# Never put the following exclusion ("ask=","claus=","general=","kw=","keyword=","keywords=","MT","p=","q=","qr=","qt=","query=","s=","search=","searchText=","string=","su=","w=") because they are strings that contain keywords we're looking for. - - 1;