From: eldy <> Date: Tue, 7 May 2002 15:47:21 +0000 (+0000) Subject: Updated search engines database. X-Git-Tag: AWSTATS_4_1_BETA~79 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9835eb1e98e16c66a4bed337bfe80ab5bb661d4d;p=thirdparty%2FAWStats.git Updated search engines database. --- diff --git a/wwwroot/cgi-bin/awstats.pl b/wwwroot/cgi-bin/awstats.pl index dd4395fe..46c90af1 100644 --- a/wwwroot/cgi-bin/awstats.pl +++ b/wwwroot/cgi-bin/awstats.pl @@ -13,7 +13,7 @@ # use strict and use vars are commented to make AWStats working with old perl. use strict;no strict "refs"; -use vars qw(%DomainsHashIDLib @RobotsSearchIDOrder_list1 @RobotsSearchIDOrder_list2 @RobotsSearchIDOrder_list3 @BrowsersSearchIDOrder @OSSearchIDOrder @WordsToCleanSearchUrl %BrowsersHereAreGrabbers %BrowsersHashIcon %BrowsersHashIDLib %OSHashID %OSHashLib %RobotsHashIDLib %SearchEnginesHashIDLib %SearchEnginesKnownUrl %DomainsHashIDLib); +use vars qw(%DomainsHashIDLib @RobotsSearchIDOrder_list1 @RobotsSearchIDOrder_list2 @RobotsSearchIDOrder_list3 @BrowsersSearchIDOrder @OSSearchIDOrder @WordsToCleanSearchUrl %BrowsersHereAreGrabbers %BrowsersHashIcon %BrowsersHashIDLib %OSHashID %OSHashLib %RobotsHashIDLib @SearchEnginesSearchIDOrder %SearchEnginesHashIDLib %SearchEnginesKnownUrl %DomainsHashIDLib); #use warnings; # Must be used in test mode only. This reduce a little process speed #use diagnostics; # Must be used in test mode only. This reduce a lot of process speed use Socket; @@ -123,7 +123,7 @@ my @OnlyFiles = my @SkipDNSLookupFor = my @SkipFiles = my @SkipHosts = (); my @DOWIndex=(); my @RobotArrayList = my @RobotsSearchIDOrder = (); #my @RobotsSearchIDOrder_list1=(); my @RobotsSearchIDOrder_list2=(); my @RobotsSearchIDOrder_list3=(); -#my @BrowsersSearchIDOrder = my @OSSearchIDOrder = (); +#my @BrowsersSearchIDOrder = my @OSSearchIDOrder = my @SearchEnginesSearchIDOrder(); #my @WordsToCleanSearchUrl=(); my @_msiever_h = my @_nsver_h = (); my @_from_p = my @_from_h = (); @@ -787,6 +787,7 @@ sub Read_Ref_Data { # Sanity check. if (@OSSearchIDOrder != scalar keys %OSHashID) { error("Error: Not same number of records of OSSearchIDOrder (".(@OSSearchIDOrder)." entries) and OSHashID (".(scalar keys %OSHashID)." entries) in OS database. Check your file ".$FilePath{"operating_systems.pl"}); } if (@BrowsersSearchIDOrder != scalar keys %BrowsersHashIDLib) { error("Error: Not same number of records of BrowsersSearchIDOrder (".(@BrowsersSearchIDOrder)." entries) and BrowsersHashIDLib (".(scalar keys %BrowsersHashIDLib)." entries) in Browsers database. Check your file ".$FilePath{"browsers.pl"}); } + if (@SearchEnginesSearchIDOrder != scalar keys %SearchEnginesHashIDLib) { error("Error: Not same number of records of SearchEnginesSearchIDOrder (".(@SearchEnginesSearchIDOrder)." entries) and SearchEnginesHashIDLib (".(scalar keys %SearchEnginesHashIDLib)." entries) in Search Engines database. Check your file ".$FilePath{"search_engines.pl"}); } if ((@RobotsSearchIDOrder_list1+@RobotsSearchIDOrder_list2+@RobotsSearchIDOrder_list3) != scalar keys %RobotsHashIDLib) { error("Error: Not same number of records of RobotsSearchIDOrder_listx (total is ".(@RobotsSearchIDOrder_list1+@RobotsSearchIDOrder_list2+@RobotsSearchIDOrder_list3)." entries) and RobotsHashIDLib (".(scalar keys %RobotsHashIDLib)." entries) in Robots database. Check your file ".$FilePath{"robots.pl"}); } } @@ -3245,7 +3246,7 @@ if ($UpdateStats) { # Other ? if (!$found) { - foreach my $key (@BrowsersSearchIDOrder) { + foreach my $key (@BrowsersSearchIDOrder) { # Search ID in order of BrowsersSearchIDOrder if ($UserAgent =~ /$key/) { $_browser_h{$key}++; $found=1; @@ -3277,7 +3278,7 @@ if ($UpdateStats) { if (! $TmpHashOS{$UserAgent}) { my $found=0; # in OSHashID list ? - foreach my $key (@OSSearchIDOrder) { # Searchin ID in order of OSSearchIDOrder + foreach my $key (@OSSearchIDOrder) { # Search ID in order of OSSearchIDOrder if ($UserAgent =~ /$key/) { $_os_h{$OSHashID{$key}}++; $found=1; @@ -3310,7 +3311,7 @@ if ($UpdateStats) { if ($LevelForRefererAnalyze && $field[$pos_referer]) { # Direct ? - if ($field[$pos_referer] eq "-" || $field[$pos_referer] eq "bookmarks") { # "bookmarks" is sent by Netscape + if ($field[$pos_referer] eq "-" || $field[$pos_referer] eq "bookmarks") { # "bookmarks" is sent by Netscape, "-" by all others browsers if ($PageBool) { $_from_p[0]++; } $_from_h[0]++; $found=1; @@ -3347,19 +3348,17 @@ if ($UpdateStats) { if ($LevelForSearchEnginesDetection) { - # If made on each record -> -1700 rows/seconds (should be made on 10% of records only) - foreach my $key (keys %SearchEnginesHashIDLib) { - # This hit came from the search engine $key - if ($refererserver =~ /$key/i) { - if ($Debug) { debug("Server $refererserver is added to TmpHashRefererServer with value '$key'",2); } - $TmpHashRefererServer{$refererserver}="$key"; - $found=1; - last; + # If made on each record -> -1700 rows/seconds (should be made on 10% of records only) + foreach my $key (@SearchEnginesSearchIDOrder) { # Search ID in order of SearchEnginesSearchIDOrder + if ($refererserver =~ /$key/i) { + # This hit came from the search engine $key + if ($Debug) { debug("Server $refererserver is added to TmpHashRefererServer with value '$key'",2); } + $TmpHashRefererServer{$refererserver}="$key"; + $found=1; + last; + } } } - - } - } } diff --git a/wwwroot/cgi-bin/lib/browsers.pl b/wwwroot/cgi-bin/lib/browsers.pl index 5dc31f38..8c18592b 100644 --- a/wwwroot/cgi-bin/lib/browsers.pl +++ b/wwwroot/cgi-bin/lib/browsers.pl @@ -3,7 +3,7 @@ # If you want to add a Browser to extend AWStats database detection capabilities, # you must add an entry in BrowsersSearchIDOrder and in BrowsersHashIDLib. #------------------------------------------------------- -# Last change $Revision$ - $Author$ - $Date$ +# $Revision$ - $Author$ - $Date$ @@ -96,6 +96,92 @@ "libwww" # Must be at end because some browser have both "browser id" and "libwww" ); +# BrowsersHashIDLib +# List of browser's name ("browser id in lower case", "browser text") +#--------------------------------------------------------------- +%BrowsersHashIDLib = ( +# Common web browsers text (IE and Netscape must not be in this list) +"icab","iCab", +"go!zilla","Go!Zilla", +"konqueror","Konqueror", +"links","Links", +"lynx","Lynx", +"omniweb","OmniWeb", +"opera","Opera", +"wget","Wget", +"22acidownload","22AciDownload", +"aol\\-iweng","AOL-Iweng", +"amaya","Amaya", +"amigavoyager","AmigaVoyager", +"antfresco","ANT Fresco", +"bpftp","BPFTP", +"cyberdog","Cyberdog", +"dreamcast","Dreamcast", +"downloadagent","DownloadAgent", +"ecatch", "eCatch", +"emailsiphon","EmailSiphon", +"encompass","Encompass", +"friendlyspider","FriendlySpider", +"getright","GetRight", +"headdump","HeadDump", +"hotjava","Sun HotJava", +"ibrowse","IBrowse", +"intergo","InterGO", +"linemodebrowser","W3C Line Mode Browser", +"lotus-notes","Lotus Notes web client", +"macweb","MacWeb", +"ncsa_mosaic","NCSA Mosaic", +"netpositive","NetPositive", +"nutscrape", "Nutscrape", +"msfrontpageexpress","MS FrontPage Express", +"tzgeturl","TZGETURL", +"viking","Viking", +"webfetcher","WebFetcher", +"webexplorer","IBM-WebExplorer", +"webmirror","WebMirror", +"webvcr","WebVCR", +# Site grabbers +"teleport","TelePort Pro (site grabber)", +"webcapture","Acrobat (site grabber)", +"webcopier", "WebCopier (site grabber)", +# Music only browsers +"real","RealAudio or compatible (media player)", +"winamp","WinAmp (media player)", # Works for winampmpeg and winamp3httprdr +"windows-media-player","Windows Media Player (media player)", +"audion","Audion (media player)", +"freeamp","FreeAmp (media player)", +"itunes","Apple iTunes (media player)", +"jetaudio","JetAudio (media player)", +"mint_audio","Mint Audio (media player)", +"mpg123","mpg123 (media player)", +"nsplayer","NetShow Player (media player)", +"sonique","Sonique (media player)", +"uplayer","Ultra Player (media player)", +"xmms","XMMS (media player)", +"xaudio","Some XAudio Engine based MPEG player (media player)", +# PDA/Phonecell browsers +"mmef","Microsoft Mobile Explorer (PDA/Phone browser)", +"mspie","MS Pocket Internet Explorer (PDA/Phone browser)", +"up\.","UP.Browser (PDA/Phone browser)", # Works for UP.Browser and UP.Link +"wapalizer","WAPalizer (PDA/Phone browser)", +"wapsilon","WAPsilon (PDA/Phone browser)", +"webcollage","WebCollage (PDA/Phone browser)", +"alcatel","Alcatel Browser (PDA/Phone browser)", +"nokia","Nokia Browser (PDA/Phone browser)", +# Others (TV) +"webtv","WebTV browser", +# Other kind of browsers +"csscheck","WDG CSS Validator", +"w3m","w3m", +"w3c_css_validator","W3C CSS Validator", +"w3c_validator","W3C HTML Validator", +"wdg_validator","WDG HTML Validator", +"webzip","WebZIP", +"staroffice","StarOffice", +"libwww","LibWWW" +); + + # BrowsersHashAreGrabber # Put here an entry for each browser in BrowsersSearchIDOrder that are grabber # browsers. @@ -106,6 +192,7 @@ "webcopier","1",, ); + # BrowsersHashIcon # Each Browsers Search ID is associated to a string that is the name of icon # file for this OS. @@ -195,88 +282,4 @@ ); -# Browser name list ("browser id in lower case", "browser text") -#--------------------------------------------------------------- -%BrowsersHashIDLib = ( -# Common web browsers text (IE and Netscape must not be in this list) -"icab","iCab", -"go!zilla","Go!Zilla", -"konqueror","Konqueror", -"links","Links", -"lynx","Lynx", -"omniweb","OmniWeb", -"opera","Opera", -"wget","Wget", -"22acidownload","22AciDownload", -"aol\\-iweng","AOL-Iweng", -"amaya","Amaya", -"amigavoyager","AmigaVoyager", -"antfresco","ANT Fresco", -"bpftp","BPFTP", -"cyberdog","Cyberdog", -"dreamcast","Dreamcast", -"downloadagent","DownloadAgent", -"ecatch", "eCatch", -"emailsiphon","EmailSiphon", -"encompass","Encompass", -"friendlyspider","FriendlySpider", -"getright","GetRight", -"headdump","HeadDump", -"hotjava","Sun HotJava", -"ibrowse","IBrowse", -"intergo","InterGO", -"linemodebrowser","W3C Line Mode Browser", -"lotus-notes","Lotus Notes web client", -"macweb","MacWeb", -"ncsa_mosaic","NCSA Mosaic", -"netpositive","NetPositive", -"nutscrape", "Nutscrape", -"msfrontpageexpress","MS FrontPage Express", -"tzgeturl","TZGETURL", -"viking","Viking", -"webfetcher","WebFetcher", -"webexplorer","IBM-WebExplorer", -"webmirror","WebMirror", -"webvcr","WebVCR", -# Site grabbers -"teleport","TelePort Pro (site grabber)", -"webcapture","Acrobat (site grabber)", -"webcopier", "WebCopier (site grabber)", -# Music only browsers -"real","RealAudio or compatible (media player)", -"winamp","WinAmp (media player)", # Works for winampmpeg and winamp3httprdr -"windows-media-player","Windows Media Player (media player)", -"audion","Audion (media player)", -"freeamp","FreeAmp (media player)", -"itunes","Apple iTunes (media player)", -"jetaudio","JetAudio (media player)", -"mint_audio","Mint Audio (media player)", -"mpg123","mpg123 (media player)", -"nsplayer","NetShow Player (media player)", -"sonique","Sonique (media player)", -"uplayer","Ultra Player (media player)", -"xmms","XMMS (media player)", -"xaudio","Some XAudio Engine based MPEG player (media player)", -# PDA/Phonecell browsers -"mmef","Microsoft Mobile Explorer (PDA/Phone browser)", -"mspie","MS Pocket Internet Explorer (PDA/Phone browser)", -"up\.","UP.Browser (PDA/Phone browser)", # Works for UP.Browser and UP.Link -"wapalizer","WAPalizer (PDA/Phone browser)", -"wapsilon","WAPsilon (PDA/Phone browser)", -"webcollage","WebCollage (PDA/Phone browser)", -"alcatel","Alcatel Browser (PDA/Phone browser)", -"nokia","Nokia Browser (PDA/Phone browser)", -# Others (TV) -"webtv","WebTV browser", -# Other kind of browsers -"csscheck","WDG CSS Validator", -"w3m","w3m", -"w3c_css_validator","W3C CSS Validator", -"w3c_validator","W3C HTML Validator", -"wdg_validator","WDG HTML Validator", -"webzip","WebZIP", -"staroffice","StarOffice", -"libwww","LibWWW" -); - 1; diff --git a/wwwroot/cgi-bin/lib/search_engines.pl b/wwwroot/cgi-bin/lib/search_engines.pl index 8ac97b02..6c704f56 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pl +++ b/wwwroot/cgi-bin/lib/search_engines.pl @@ -1,35 +1,106 @@ # AWSTATS SEARCH ENGINES DATABASE -#-------------------------------- -# Last change $Revision$ - $Author$ - $Date$ +#------------------------------------------------------- +# If you want to add a Search Engine to extend AWStats database detection capabilities, +# you must add an entry in SearchEnginesSearchIDOrder and in SearchEnginesHashIDLib. +# An entry if known in SearchEnginesKnownUrl is also welcome. +#------------------------------------------------------- +# $Revision$ - $Author$ - $Date$ -# Search engines names database -# To add a search engine, add a new line: +# SearchEnginesSearchIDOrder +# This list is used to know in which order to search Search Engines IDs (Most +# frequent one are first in this list to increase detect speed). +# Note: Browsers IDs are in lower case and ' ' and '+' are changed into '_' +#----------------------------------------------------------------- +@SearchEnginesSearchIDOrder=( +# Major internationnal search engines +"google\.", +"msn\.", +"voila\.", +"yahoo\.", +"lycos\.", +"altavista\.", +"search\.terra\.", +"alltheweb\.com", +"netscape\.", +"dmoz\.org", +"search\.aol\.co", +"www\.search\.com", +"overture\.com", # Replace "goto\.com","Goto.com", +# Minor internationnal search engines +"northernlight\.", +"hotbot\.", +"kvasir\.", +"webcrawler\.", +"metacrawler\.", +"go2net\.com", +"go\.com", +"euroseek\.", +"excite\.", +"lokace\.", +"spray\.", +"netfind\.aol\.com", +"recherche\.aol\.fr", +"nbci\.com/search", +"askjeeves\.", +"mamma\.", +"dejanews\.", +"search\.dogpile\.com", +"wisenut\.com", +# Minor brazilian search engines +"engine\.exe", "miner\.bol\.com\.br", +# Minor danish search-engines +"opasia\.dk", "danielsen\.com", +# Minor dutch search engines +"ilse\.","vindex\.", +# Minor english search engines +"splut\.","ukplus\.","mirago\.","ukindex\.co\.uk","ukdirectory\.", +# Minor finnish search engines +"haku\.www\.fi", +# Minor french search engines +"nomade\.fr/","ctrouve\.","francite\.","\.lbb\.org","rechercher\.libertysurf\.fr", +# Minor german search engines +"fireball\.de","infoseek\.de","suche\.web\.de","meta\.ger", +# Minor italian search engines +"virgilio\.it", +# Minor norvegian search engines +"sok\.start\.no", +# Minor swedish search engines +"evreka\.passagen\.se", +# Minor czech search engines +"atlas\.cz","seznam\.cz","quick\.cz","centrum\.cz","najdi\.to","redbox\.cz", +# Other +"search\..*com" +); + + +# SearchEnginesHashIDLib +# List of search engines names # "match_string_in_url_that_identify_engine", "search_engine_name", #----------------------------------------------------------------- %SearchEnginesHashIDLib=( -# Most common search engines -"yahoo\.","Yahoo", -"altavista\.","AltaVista", +# Major internationnal search engines +"google\.","Google", "msn\.","MSN", "voila\.", "Voila", +"yahoo\.","Yahoo", "lycos\.","Lycos", +"altavista\.","AltaVista", "search\.terra\.","Terra", -"google\.","Google", "alltheweb\.com","AllTheWeb", "netscape\.","Netscape", -"northernlight\.","NorthernLight", "dmoz\.org","DMOZ", "search\.aol\.co","AOL", "www\.search\.com","Search.com", -"kvasir\.","Kvasir", -# Others +"overture\.com","Overture", # Replace "goto\.com","Goto.com", +# Minor internationnal search engines +"northernlight\.","NorthernLight", "hotbot\.","Hotbot", +"kvasir\.","Kvasir", "webcrawler\.","WebCrawler", "metacrawler\.","MetaCrawler (Metamoteur)", "go2net\.com","Go2Net (Metamoteur)", "go\.com","Go.com", -"overture\.com","Overture", # Replace "goto\.com","Goto.com", "euroseek\.","Euroseek", "excite\.","Excite", "lokace\.", "Lokace", @@ -42,21 +113,35 @@ "dejanews\.","DejaNews", "search\.dogpile\.com","Dogpile", "wisenut\.com","WISENut", -"engine\.exe","Cade", "miner\.bol\.com\.br","Meta Miner", # Minor brazilian search engines -"opasia\.dk","Opasia", "danielsen\.com","Thor (danielsen.com)", # Minor danish search-engines -"ilse\.","Ilse","vindex\.","Vindex\.nl", # Minor dutch search engines -"splut\.","Splut", "ukplus\.", "UKPlus", "mirago\.", "Mirago", "ukindex\.co\.uk", "UKIndex", "ukdirectory\.","UK Directory", # Minor english search engines -"haku\.www\.fi","Ihmemaa", # Minor finnish search engines -"nomade\.fr/","Nomade", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf", # Minor french search engines -"fireball\.de","Fireball", "infoseek\.de","Infoseek", "suche\.web\.de","Web.de", "meta\.ger","MetaGer", # Minor german search engines -"virgilio\.it","Virgilio", # Minor italian search engines -"sok\.start\.no","start.no", # Minor norvegian search engines -"evreka\.passagen\.se","Evreka", # Minor swedish search engines -"atlas\.cz","Atlas.cz", "seznam\.cz","Seznam.cz", "quick\.cz","Quick.cz", "centrum\.cz","Centrum.cz", #Minor czech search engines +# Minor brazilian search engines +"engine\.exe","Cade", "miner\.bol\.com\.br","Meta Miner", +# Minor danish search-engines +"opasia\.dk","Opasia", "danielsen\.com","Thor (danielsen.com)", +# Minor dutch search engines +"ilse\.","Ilse","vindex\.","Vindex\.nl", +# Minor english search engines +"splut\.","Splut", "ukplus\.", "UKPlus", "mirago\.", "Mirago", "ukindex\.co\.uk", "UKIndex", "ukdirectory\.","UK Directory", +# Minor finnish search engines +"haku\.www\.fi","Ihmemaa", +# Minor french search engines +"nomade\.fr/","Nomade", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf", +# Minor german search engines +"fireball\.de","Fireball", "infoseek\.de","Infoseek", "suche\.web\.de","Web.de", "meta\.ger","MetaGer", +# Minor italian search engines +"virgilio\.it","Virgilio", +# Minor norvegian search engines +"sok\.start\.no","start.no", +# Minor swedish search engines +"evreka\.passagen\.se","Evreka", +# Minor czech search engines +"atlas\.cz","Atlas.cz", "seznam\.cz","Seznam.cz", "quick\.cz","Quick.cz", "centrum\.cz","Centrum.cz","najdi\.to","Najdi.to","redbox\.cz","RedBox.cz", +# Other "search\..*com","Other search engines" ); -# Search engines known URLs rules to find keywords + +# SearchEnginesKnownUrl +# Search engines known rules to extract keywords from a referrer URL #------------------------------------------------- %SearchEnginesKnownUrl=( # Most common search engines @@ -100,8 +185,9 @@ "fireball\.de","q=", "infoseek\.de","qt=", "suche\.web\.de","su=", # Minor german search engines "sok\.start\.no", "q=", # Minor norvegian search engines "evreka\.passagen\.se","q=", # Minor swedish search engines -"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=" # Minor czech search engines +"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=", "najdi\.to","dotaz=", "redbox.cz","srch=" # Minor czech search engines ); + # If no rules are known, this will be used to clean URL of not keyword parameters. @WordsToCleanSearchUrl= ("act=","annuaire=","btng=","categoria=","cfg=","cof=","cou=","cp=","dd=","domain=","dt=","dw=","exec=","geo=","hc=","height=","hl=","hq=","hs=","id=","kl=","lang=","loc=","lr=","matchmode=","medor=","message=","meta=","mode=","order=","page=","par=","pays=","pg=","pos=","prg=","qc=","refer=","sa=","safe=","sc=","sort=","src=","start=","style=","stype=","sum=","tag=","temp=","theme=","url=","user=","width=","what=","\\.x=","\\.y=","y=","look="); # Never put the following exclusion ("ask=","claus=","general=","kw=","keyword=","keywords=","MT","p=","q=","qr=","qt=","query=","s=","search=","searchText=","string=","su=","w=") because they are strings that contain keywords we're looking for.