]> git.ipfire.org Git - thirdparty/AWStats.git/commitdiff
Updated database
authoreldy <>
Tue, 8 Oct 2002 04:00:52 +0000 (04:00 +0000)
committereldy <>
Tue, 8 Oct 2002 04:00:52 +0000 (04:00 +0000)
wwwroot/cgi-bin/lib/robots.pm
wwwroot/cgi-bin/lib/search_engines.pm

index 245702dc5927a182866f083ca02f1bc2d7710cc9..42efe11db867c5ddc8489eca583f8e872157354c 100644 (file)
 "xget",
 # Other robots reported by users
 "awbot",
+"baiduspider",
 "bobby",
 "boris",
 "bumblebee",
 "cscrawler",
 "daviesbot",
+"exactseek",
 "ezresult",
 "gigabot",
 "gnodspider",
 "justview",
 "linkbot",
 "linkchecker",
+"msiecrawler",
 "nederland.zoek",
 "perman",
 "pompos",
 "redalert",
 "shoutcast",
 "slysearch",
+"turnitinbot",
 "ultraseek",
 "webcompass",
-"yandex"
+"yandex",
+"zyborg"
 );
 @RobotsSearchIDOrder_list3 = (
 # Generic robot
 "getbot","GetBot",
 "geturl","GetURL",
 "golem","Golem",
-"googlebot","Googlebot",
+"googlebot","Googlebot (Google)",
 "grapnel","Grapnel/0.01 Experiment",
 "griffon","Griffon",
 "gromit","Gromit",
 "roverbot","Roverbot",
 "rules","RuLeS",
 "safetynetrobot","SafetyNet Robot",
-"scooter","Scooter",
+"scooter","Scooter (AltaVista)",
 "search_au","Search.Aus-AU.COM",
 "searchprocess","SearchProcess",
 "senrigan","Senrigan",
 # Other robots reported by users
 "antibot", "Antibot",
 "awbot", "AWBot",
+"baiduspider","BaiDuSpider",
 "bobby", "Bobby", 
 "boris", "Boris",
 "bumblebee", "Bumblebee (relevare.com)",
 "daviesbot", "DaviesBot",
 "digout4u", "Digout4u",
 "echo", "EchO!",
+"exactseek","ExactSeek Crawler",
 "ezresult",    "Ezresult",
-"fast-webcrawler", "Fast-Webcrawler",
+"fast-webcrawler", "Fast-Webcrawler (AllTheWeb)",
 "gigabot","GigaBot",
 "gnodspider","GNOD Spider",
-"ia_archiver", "IA Archiver",
+"ia_archiver", "Alexa (IA Archiver)",
 "internetseer", "InternetSeer",
 "jennybot", "JennyBot",
 "justview", "JustView",
 "linkbot","LinkBot",
 "linkchecker","LinkChecker",
 "mercator", "Mercator",
+"msiecrawler","MSIECrawler",
 "perman", "Perman surfer",
 "petersnews", "Petersnews",
 "pompos","Pompos",
 "redalert", "Red Alert",
 "shoutcast","Shoutcast Directory Service",
 "slysearch","SlySearch",
+"turnitinbot","Turn It In",
 "ultraseek", "Ultraseek",
-"unlost_web_crawler", "Unlost_Web_Crawler",
+"unlost_web_crawler", "Unlost Web Crawler",
 "voila", "Voila",
 "webbase", "WebBase",
 "webcompass", "webcompass",
-"wisenutbot","WISENutbot",
+"wisenutbot","WISENutbot (Looksmart)",
 "yandex", "Yandex bot",
+"zyborg","Zyborg (Looksmart)",
 
 # Generic root ID
 "robot", "Unknown robot (identified by 'robot')",
index 8b8b4821ff5f1b327b1b1d228f96f65eb9ddea22..a64e749ea486db7175b5eef8221cdf4fec1fe4d9 100644 (file)
@@ -17,7 +17,7 @@
 #-----------------------------------------------------------------
 @SearchEnginesSearchIDOrder=(
 # Major internationnal search engines
-"google\.",
+"google\.",    # TODO Add 216\.239\.35\.101|216\.239\.37\.101|216\.239\.39\.100|216\.239\.39\.101|216\.239\.51\.100|216\.239\.51\.101|216\.239\.35\.100
 "msn\.",
 "voila\.",
 "yahoo\.",
@@ -27,9 +27,9 @@
 "alltheweb\.com",
 "netscape\.",
 "dmoz\.org",
-"search\.aol\.co",
 "www\.search\.com",
-"overture\.com",               # Replace "goto\.com","Goto.com",
+"tiscali\.",
+"search\.aol\.co",
 # Minor internationnal search engines
 "northernlight\.",
 "hotbot\.",
 "(^|\.)go\.com",
 "euroseek\.",
 "excite\.",
-"lokace\.",
+"looksmart\.",
 "spray\.",
-"netfind\.aol\.com",
-"recherche\.aol\.fr",
 "nbci\.com/search",
 "askjeeves\.",
+"atomz\.",
+"overture\.com",               # Replace "goto\.com","Goto.com",
+"teoma\.",
+"findarticles\.com",
+"infospace\.com",
 "mamma\.",
 "dejanews\.",
 "search\.dogpile\.com",
@@ -61,7 +64,7 @@
 # Minor finnish search engines
 "haku\.www\.fi",
 # Minor french search engines
-"nomade\.fr/","ctrouve\.","francite\.","\.lbb\.org","rechercher\.libertysurf\.fr",
+"recherche\.aol\.fr","ctrouve\.","francite\.","\.lbb\.org","rechercher\.libertysurf\.fr",
 # Minor german search engines
 "fireball\.de","infoseek\.de","suche\.web\.de","meta\.ger",
 # Minor hungarian search engines
 );
 
 
+# SearchEnginesKnownUrl
+# Known rules, per search engine, to extract keywords from a referrer URL
+#-------------------------------------------------
+%SearchEnginesKnownUrl=(
+# Most common search engines
+"alltheweb\.com","q(|uery)=",
+"altavista\.","q=",
+"dmoz\.org","search=",
+"google\.","(p|q)=",
+"lycos\.","query=",
+"msn\.","q=",
+"netscape\.","search=",
+"search\.aol\.co","query=",
+"search\.terra\.","query=",
+"voila\.","kw=",
+"www\.search\.com","q=",
+"yahoo\.","p=",
+# Minor international search engines
+"(^|\.)go\.com","qt=",
+"askjeeves\.","ask=",
+"atomz\.","sp-q=",
+"euroseek\.","query=",
+"excite\.","search=",
+"findarticles\.com","key=",
+"go2net\.com","general=",
+"hotbot\.","mt=",
+"infospace\.com","qkw=",
+"kvasir\.", "q=",
+"looksmart\.","key=",
+"mamma\.","query=",
+"metacrawler\.","general=",
+"nbci\.com/search","keyword=",
+"northernlight\.","qr=",
+"overture\.com","keywords=",
+"search\.dogpile\.com", "q=",
+"spray\.","string=",
+"teoma\.","q=",
+"virgilio\.it","qs=",
+"webcrawler","searchText=",
+"wisenut\.com","query=",
+# Minor brazilian search engines
+"engine\.exe","p1=", "miner\.bol\.com\.br","q=",
+# Minor danish search engines
+"opasia\.dk","q=", "danielsen\.com","q=",
+# Minor dutch search engines
+"ilse\.","search_for=", "vindex\.","in=",
+# Minor english search engines
+"splut\.","pattern=", "ukplus\.", "search=", "mirago\.", "txtsearch=",
+# Minor english search engines
+"ukindex\.co\.uk", "stext=", "ukdirectory\.","k=",
+# Minor finnish search engines
+"haku\.www\.fi","w=",
+# Minor french search engines
+"nomade\.fr/","s=", "francite\.","name=",
+# Minor german search engines
+"fireball\.de","q=", "infoseek\.de","qt=", "suche\.web\.de","su=",
+# Minor hungarian search engines
+"heureka\.hu","heureka=", "vizsla\.origo\.hu/katalogus?","q=", "vizsla\.origo\.hu","search=", "lapkereso\.hu","keres.php", "goliat\.hu","KERESES=", "index\.hu","search.php3", "wahoo\.hu","q=", "freeweb\.hu","KERESES=", "search\.internetto\.hu","searchstr=",
+# Minor Norwegian search engines
+"sok\.start\.no", "q=",
+# Minor swedish search engines
+"evreka\.passagen\.se","q=",
+# Minor czech search engines
+"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=", "najdi\.to","dotaz=", "redbox.cz","srch="
+);
+
+
+# If no rule is known, this list is used to search for the keyword parameter
+@WordsToExtractSearchUrl= ("ask=","claus=","general=","key=","kw=","keyword=","keywords=","MT=","p=","q=","qr=","qt=","query=","s=","search=","searchText=","string=","su=","txtsearch=","w=");
+
+# If no rule is known and the search in WordsToExtractSearchUrl failed, this list is used to strip non-keyword parameters from the URL.
+@WordsToCleanSearchUrl= ("act=","annuaire=","btng=","categoria=","cfg=","cof=","cou=","count=","cp=","dd=","domain=","dt=","dw=","enc=","exec=","geo=","hc=","height=","hits=","hl=","hq=","hs=","id=","kl=","lang=","loc=","lr=","matchmode=","medor=","message=","meta=","mode=","order=","page=","par=","pays=","pg=","pos=","prg=","qc=","refer=","sa=","safe=","sc=","sort=","src=","start=","style=","stype=","sum=","tag=","temp=","theme=","type=","url=","user=","width=","what=","\\.x=","\\.y=","y=","look=");
+
+
+
 # SearchEnginesHashIDLib
 # List of search engines names
 # "match_string_in_url_that_identify_engine", "search_engine_name",
 #-----------------------------------------------------------------
 %SearchEnginesHashIDLib=(
 # Major internationnal search engines
+"alltheweb\.com","AllTheWeb",
+"altavista\.","AltaVista",
+"dmoz\.org","DMOZ",
 "google\.","Google",
-"msn\.","MSN",
-"voila\.", "Voila",
-"yahoo\.","Yahoo",
 "lycos\.","Lycos",
-"altavista\.","AltaVista",
-"search\.terra\.","Terra",
-"alltheweb\.com","AllTheWeb",
+"msn\.","MSN",
 "netscape\.","Netscape",
-"dmoz\.org","DMOZ",
 "search\.aol\.co","AOL",
+"search\.terra\.","Terra",
+"tiscali\.","Tiscali",
+"voila\.", "Voila",
 "www\.search\.com","Search.com",
-"overture\.com","Overture",            # Replace "goto\.com","Goto.com",
+"yahoo\.","Yahoo",
 # Minor internationnal search engines
-"northernlight\.","NorthernLight",
-"hotbot\.","Hotbot",
-"kvasir\.","Kvasir",
-"webcrawler\.","WebCrawler",
-"metacrawler\.","MetaCrawler (Metamoteur)",
-"go2net\.com","Go2Net (Metamoteur)",
 "(^|\.)go\.com","Go.com",
+"askjeeves\.","Ask Jeeves",
+"atomz\.","Atomz",
+"dejanews\.","DejaNews",
 "euroseek\.","Euroseek",
 "excite\.","Excite",
-"lokace\.", "Lokace",
-"spray\.","Spray",
-"netfind\.aol\.com","AOL",
-"recherche\.aol\.fr","AOL",
-"nbci\.com/search","NBCI",
-"askjeeves\.","Ask Jeeves",
+"findarticles\.com","Find Articles",
+"go2net\.com","Go2Net (Metamoteur)",
+"hotbot\.","Hotbot",
+"infospace\.com","InfoSpace",
+"kvasir\.","Kvasir",
+"looksmart\.","Looksmart",
 "mamma\.","Mamma",
-"dejanews\.","DejaNews",
+"metacrawler\.","MetaCrawler (Metamoteur)",
+"nbci\.com/search","NBCI",
+"northernlight\.","NorthernLight",
+"overture\.com","Overture",                                    # Replace "goto\.com","Goto.com",
 "search\.dogpile\.com","Dogpile",
+"spray\.","Spray",
+"teoma\.","Teoma",                                                     # Replace "directhit\.com","DirectHit",
+"webcrawler\.","WebCrawler",
 "wisenut\.com","WISENut",
 # Minor brazilian search engines
 "engine\.exe","Cade", "miner\.bol\.com\.br","Meta Miner",
-# Minor danish search-engines 
+# Minor danish search-engines
 "opasia\.dk","Opasia", "danielsen\.com","Thor (danielsen.com)",        
 # Minor dutch search engines
 "ilse\.","Ilse","vindex\.","Vindex\.nl",                                               
 # Minor finnish search engines
 "haku\.www\.fi","Ihmemaa",                                                                             
 # Minor french search engines
-"nomade\.fr/","Nomade", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf",        
+"recherche\.aol\.fr","AOL", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf",    
 # Minor german search engines
 "fireball\.de","Fireball", "infoseek\.de","Infoseek", "suche\.web\.de","Web.de", "meta\.ger","MetaGer",        
 # Minor hungarian search engines
 );
 
 
-# SearchEnginesKnownUrl
-# Search engines known rules to extract keywords from a referrer URL
-#-------------------------------------------------
-%SearchEnginesKnownUrl=(
-# Most common search engines
-"yahoo\.","p=",
-"altavista\.","q=",
-"msn\.","q=",
-"voila\.","kw=",
-"lycos\.","query=",
-"search\.terra\.","query=",
-"google\.","(p|q)=",
-"alltheweb\.com","q(|uery)=",
-"netscape\.","search=",
-"northernlight\.","qr=",
-"dmoz\.org","search=",
-"search\.aol\.co","query=",
-"www\.search\.com","q=",
-"kvasir\.", "q=",
-# Others
-"askjeeves\.","ask=",
-"hotbot\.","mt=",
-"metacrawler\.","general=",
-"go2net\.com","general=",
-"(^|\.)go\.com","qt=",
-"overture\.com","keywords=",
-"euroseek\.","query=",
-"excite\.","search=",
-"spray\.","string=",
-"nbci\.com/search","keyword=",
-"mamma\.","query=",
-"search\.dogpile\.com", "q=",
-"wisenut\.com","query=",
-"virgilio\.it","qs=",
-"webcrawler","searchText=",
-"engine\.exe","p1=", "miner\.bol\.com\.br","q=",                               # Minor brazilian search engines
-"opasia\.dk","q=", "danielsen\.com","q=",                                              # Minor danish search engines
-"ilse\.","search_for=", "vindex\.","in=",                                              # Minor dutch search engines
-"splut\.","pattern=", "ukplus\.", "search=", "mirago\.", "txtSearch=",         # Minor english search engines
-"ukindex\.co\.uk", "stext=", "ukdirectory\.","k=",                                                     # Minor english search engines
-"haku\.www\.fi","w=",                                                                                                          # Minor finnish search engines
-"nomade\.fr/","s=", "francite\.","name=",                                                                      # Minor french search engines
-"fireball\.de","q=", "infoseek\.de","qt=", "suche\.web\.de","su=",                     # Minor german search engines
-"heureka\.hu","heureka=", "vizsla\.origo\.hu/katalogus?","q=", "vizsla\.origo\.hu","search=", "lapkereso\.hu","keres.php", "goliat\.hu","KERESES=", "index\.hu","search.php3", "wahoo\.hu","q=", "freeweb\.hu","KERESES=", "search\.internetto\.hu","searchstr=",  #Minor hungarian search engines
-"sok\.start\.no", "q=",                                                                                        # Minor norvegian search engines
-"evreka\.passagen\.se","q=",                                                                           # Minor swedish search engines
-"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=", "najdi\.to","dotaz=", "redbox.cz","srch="                # Minor czech search engines
-);
-
-# If no rules are known, this will be used to clean URL of not keyword parameters.
-@WordsToCleanSearchUrl= ("act=","annuaire=","btng=","categoria=","cfg=","cof=","cou=","cp=","dd=","domain=","dt=","dw=","enc=","exec=","geo=","hc=","height=","hl=","hq=","hs=","id=","kl=","lang=","loc=","lr=","matchmode=","medor=","message=","meta=","mode=","order=","page=","par=","pays=","pg=","pos=","prg=","qc=","refer=","sa=","safe=","sc=","sort=","src=","start=","style=","stype=","sum=","tag=","temp=","theme=","url=","user=","width=","what=","\\.x=","\\.y=","y=","look=");
-# Never put the following exclusion ("ask=","claus=","general=","kw=","keyword=","keywords=","MT","p=","q=","qr=","qt=","query=","s=","search=","searchText=","string=","su=","w=") because they are strings that contain keywords we're looking for.
-
-
 1;