]> git.ipfire.org Git - thirdparty/AWStats.git/commitdiff
Updated search engines database.
authoreldy <>
Tue, 7 May 2002 15:47:21 +0000 (15:47 +0000)
committereldy <>
Tue, 7 May 2002 15:47:21 +0000 (15:47 +0000)
wwwroot/cgi-bin/awstats.pl
wwwroot/cgi-bin/lib/browsers.pl
wwwroot/cgi-bin/lib/search_engines.pl

index dd4395fee45e94a6c780c6eca3f65c4b51043e67..46c90af134577e012d1fb8701ceb7b992068860a 100644 (file)
@@ -13,7 +13,7 @@
 
 # use strict and use vars are commented to make AWStats working with old perl.
 use strict;no strict "refs";
-use vars qw(%DomainsHashIDLib @RobotsSearchIDOrder_list1 @RobotsSearchIDOrder_list2 @RobotsSearchIDOrder_list3 @BrowsersSearchIDOrder @OSSearchIDOrder @WordsToCleanSearchUrl %BrowsersHereAreGrabbers %BrowsersHashIcon %BrowsersHashIDLib %OSHashID %OSHashLib %RobotsHashIDLib %SearchEnginesHashIDLib %SearchEnginesKnownUrl %DomainsHashIDLib);
+use vars qw(%DomainsHashIDLib @RobotsSearchIDOrder_list1 @RobotsSearchIDOrder_list2 @RobotsSearchIDOrder_list3 @BrowsersSearchIDOrder @OSSearchIDOrder @WordsToCleanSearchUrl %BrowsersHereAreGrabbers %BrowsersHashIcon %BrowsersHashIDLib %OSHashID %OSHashLib %RobotsHashIDLib @SearchEnginesSearchIDOrder %SearchEnginesHashIDLib %SearchEnginesKnownUrl %DomainsHashIDLib);
 #use warnings;         # Must be used in test mode only. This reduce a little process speed
 #use diagnostics;      # Must be used in test mode only. This reduce a lot of process speed
 use Socket;
@@ -123,7 +123,7 @@ my @OnlyFiles = my @SkipDNSLookupFor = my @SkipFiles = my @SkipHosts = ();
 my @DOWIndex=();
 my @RobotArrayList = my @RobotsSearchIDOrder = ();
 #my @RobotsSearchIDOrder_list1=(); my @RobotsSearchIDOrder_list2=();  my @RobotsSearchIDOrder_list3=();
-#my @BrowsersSearchIDOrder = my @OSSearchIDOrder = ();
+#my @BrowsersSearchIDOrder = my @OSSearchIDOrder = my @SearchEnginesSearchIDOrder = ();
 #my @WordsToCleanSearchUrl=();
 my @_msiever_h = my @_nsver_h = ();
 my @_from_p = my @_from_h = ();
@@ -787,6 +787,7 @@ sub Read_Ref_Data {
        # Sanity check.
        if (@OSSearchIDOrder != scalar keys %OSHashID) { error("Error: Not same number of records of OSSearchIDOrder (".(@OSSearchIDOrder)." entries) and OSHashID (".(scalar keys %OSHashID)." entries) in OS database. Check your file ".$FilePath{"operating_systems.pl"}); }
        if (@BrowsersSearchIDOrder != scalar keys %BrowsersHashIDLib) { error("Error: Not same number of records of BrowsersSearchIDOrder (".(@BrowsersSearchIDOrder)." entries) and BrowsersHashIDLib (".(scalar keys %BrowsersHashIDLib)." entries) in Browsers database. Check your file ".$FilePath{"browsers.pl"}); }
+       if (@SearchEnginesSearchIDOrder != scalar keys %SearchEnginesHashIDLib) { error("Error: Not same number of records of SearchEnginesSearchIDOrder (".(@SearchEnginesSearchIDOrder)." entries) and SearchEnginesHashIDLib (".(scalar keys %SearchEnginesHashIDLib)." entries) in Search Engines database. Check your file ".$FilePath{"search_engines.pl"}); }
        if ((@RobotsSearchIDOrder_list1+@RobotsSearchIDOrder_list2+@RobotsSearchIDOrder_list3) != scalar keys %RobotsHashIDLib) { error("Error: Not same number of records of RobotsSearchIDOrder_listx (total is ".(@RobotsSearchIDOrder_list1+@RobotsSearchIDOrder_list2+@RobotsSearchIDOrder_list3)." entries) and RobotsHashIDLib (".(scalar keys %RobotsHashIDLib)." entries) in Robots database. Check your file ".$FilePath{"robots.pl"}); }
 }
 
@@ -3245,7 +3246,7 @@ if ($UpdateStats) {
 
                                # Other ?
                                if (!$found) {
-                                       foreach my $key (@BrowsersSearchIDOrder) {
+                                       foreach my $key (@BrowsersSearchIDOrder) {      # Search ID in order of BrowsersSearchIDOrder
                                                if ($UserAgent =~ /$key/) {
                                                        $_browser_h{$key}++;
                                                        $found=1;
@@ -3277,7 +3278,7 @@ if ($UpdateStats) {
                        if (! $TmpHashOS{$UserAgent}) {
                                my $found=0;
                                # in OSHashID list ?
-                               foreach my $key (@OSSearchIDOrder) {    # Searchin ID in order of OSSearchIDOrder
+                               foreach my $key (@OSSearchIDOrder) {    # Search ID in order of OSSearchIDOrder
                                        if ($UserAgent =~ /$key/) {
                                                $_os_h{$OSHashID{$key}}++;
                                                $found=1;
@@ -3310,7 +3311,7 @@ if ($UpdateStats) {
                if ($LevelForRefererAnalyze && $field[$pos_referer]) {
 
                        # Direct ?
-                       if ($field[$pos_referer] eq "-" || $field[$pos_referer] eq "bookmarks") {       # "bookmarks" is sent by Netscape
+                       if ($field[$pos_referer] eq "-" || $field[$pos_referer] eq "bookmarks") {       # "bookmarks" is sent by Netscape, "-" by all others browsers
                                if ($PageBool) { $_from_p[0]++; }
                                $_from_h[0]++;
                                $found=1;
@@ -3347,19 +3348,17 @@ if ($UpdateStats) {
 
                                                        if ($LevelForSearchEnginesDetection) {
                                                                
-                                                       # If made on each record -> -1700 rows/seconds (should be made on 10% of records only)
-                                                       foreach my $key (keys %SearchEnginesHashIDLib) {
-                                                               # This hit came from the search engine $key
-                                                               if ($refererserver =~ /$key/i) {
-                                                                       if ($Debug) { debug("Server $refererserver is added to TmpHashRefererServer with value '$key'",2); }
-                                                                       $TmpHashRefererServer{$refererserver}="$key";
-                                                                       $found=1;
-                                                                       last;
+                                                               # If made on each record -> -1700 rows/seconds (should be made on 10% of records only)
+                                                               foreach my $key (@SearchEnginesSearchIDOrder) {         # Search ID in order of SearchEnginesSearchIDOrder
+                                                                       if ($refererserver =~ /$key/i) {
+                                                                               # This hit came from the search engine $key
+                                                                               if ($Debug) { debug("Server $refererserver is added to TmpHashRefererServer with value '$key'",2); }
+                                                                               $TmpHashRefererServer{$refererserver}="$key";
+                                                                               $found=1;
+                                                                               last;
+                                                                       }
                                                                }
                                                        }
-
-                                                       }
-                                                       
                                                }
                                        }
 
index 5dc31f3857cb8dbdfa3cbda015594114df96d466..8c18592bf2adf56e5e444135fef359a484818b53 100644 (file)
@@ -3,7 +3,7 @@
 # If you want to add a Browser to extend AWStats database detection capabilities,
 # you must add an entry in BrowsersSearchIDOrder and in BrowsersHashIDLib.
 #-------------------------------------------------------
-# Last change $Revision$ - $Author$ - $Date$
+# $Revision$ - $Author$ - $Date$
 
 
 
 "libwww"                               # Must be at end because some browser have both "browser id" and "libwww"
 );
 
+# BrowsersHashIDLib
+# List of browser names ("browser id in lower case", "browser text")
+#---------------------------------------------------------------
+%BrowsersHashIDLib = (
+# Common web browsers text (IE and Netscape must not be in this list)
+"icab","iCab",
+"go!zilla","Go!Zilla",
+"konqueror","Konqueror",
+"links","Links",
+"lynx","Lynx",
+"omniweb","OmniWeb",
+"opera","Opera",
+"wget","Wget",
+"22acidownload","22AciDownload",
+"aol\\-iweng","AOL-Iweng",
+"amaya","Amaya",
+"amigavoyager","AmigaVoyager",
+"antfresco","ANT Fresco",
+"bpftp","BPFTP",
+"cyberdog","Cyberdog",
+"dreamcast","Dreamcast",
+"downloadagent","DownloadAgent",
+"ecatch", "eCatch",
+"emailsiphon","EmailSiphon",
+"encompass","Encompass",
+"friendlyspider","FriendlySpider",
+"getright","GetRight",
+"headdump","HeadDump",
+"hotjava","Sun HotJava",
+"ibrowse","IBrowse",
+"intergo","InterGO",
+"linemodebrowser","W3C Line Mode Browser",
+"lotus-notes","Lotus Notes web client",
+"macweb","MacWeb",
+"ncsa_mosaic","NCSA Mosaic",
+"netpositive","NetPositive",
+"nutscrape", "Nutscrape",
+"msfrontpageexpress","MS FrontPage Express",
+"tzgeturl","TZGETURL",
+"viking","Viking",
+"webfetcher","WebFetcher",
+"webexplorer","IBM-WebExplorer",
+"webmirror","WebMirror",
+"webvcr","WebVCR",
+# Site grabbers
+"teleport","TelePort Pro (site grabber)",
+"webcapture","Acrobat (site grabber)",
+"webcopier", "WebCopier (site grabber)",
+# Music only browsers
+"real","RealAudio or compatible (media player)",
+"winamp","WinAmp (media player)",                              # Works for winampmpeg and winamp3httprdr
+"windows-media-player","Windows Media Player (media player)",
+"audion","Audion (media player)",
+"freeamp","FreeAmp (media player)",
+"itunes","Apple iTunes (media player)",
+"jetaudio","JetAudio (media player)",
+"mint_audio","Mint Audio (media player)",
+"mpg123","mpg123 (media player)",
+"nsplayer","NetShow Player (media player)",
+"sonique","Sonique (media player)",
+"uplayer","Ultra Player (media player)",
+"xmms","XMMS (media player)",
+"xaudio","Some XAudio Engine based MPEG player (media player)",
+# PDA/Phonecell browsers
+"mmef","Microsoft Mobile Explorer (PDA/Phone browser)",
+"mspie","MS Pocket Internet Explorer (PDA/Phone browser)",
+"up\.","UP.Browser (PDA/Phone browser)",                                       # Works for UP.Browser and UP.Link
+"wapalizer","WAPalizer (PDA/Phone browser)",
+"wapsilon","WAPsilon (PDA/Phone browser)",
+"webcollage","WebCollage (PDA/Phone browser)",
+"alcatel","Alcatel Browser (PDA/Phone browser)",
+"nokia","Nokia Browser (PDA/Phone browser)",
+# Others (TV)
+"webtv","WebTV browser",
+# Other kind of browsers
+"csscheck","WDG CSS Validator",
+"w3m","w3m",
+"w3c_css_validator","W3C CSS Validator",
+"w3c_validator","W3C HTML Validator",
+"wdg_validator","WDG HTML Validator",
+"webzip","WebZIP",
+"staroffice","StarOffice",
+"libwww","LibWWW"
+);
+
+
 # BrowsersHashAreGrabber
 # Put here an entry for each browser in BrowsersSearchIDOrder that are grabber
 # browsers.
 "webcopier","1",,
 );
 
+
 # BrowsersHashIcon
 # Each Browsers Search ID is associated to a string that is the name of icon
 # file for this OS.
 );
 
 
-# Browser name list ("browser id in lower case", "browser text")
-#---------------------------------------------------------------
-%BrowsersHashIDLib = (
-# Common web browsers text (IE and Netscape must not be in this list)
-"icab","iCab",
-"go!zilla","Go!Zilla",
-"konqueror","Konqueror",
-"links","Links",
-"lynx","Lynx",
-"omniweb","OmniWeb",
-"opera","Opera",
-"wget","Wget",
-"22acidownload","22AciDownload",
-"aol\\-iweng","AOL-Iweng",
-"amaya","Amaya",
-"amigavoyager","AmigaVoyager",
-"antfresco","ANT Fresco",
-"bpftp","BPFTP",
-"cyberdog","Cyberdog",
-"dreamcast","Dreamcast",
-"downloadagent","DownloadAgent",
-"ecatch", "eCatch",
-"emailsiphon","EmailSiphon",
-"encompass","Encompass",
-"friendlyspider","FriendlySpider",
-"getright","GetRight",
-"headdump","HeadDump",
-"hotjava","Sun HotJava",
-"ibrowse","IBrowse",
-"intergo","InterGO",
-"linemodebrowser","W3C Line Mode Browser",
-"lotus-notes","Lotus Notes web client",
-"macweb","MacWeb",
-"ncsa_mosaic","NCSA Mosaic",
-"netpositive","NetPositive",
-"nutscrape", "Nutscrape",
-"msfrontpageexpress","MS FrontPage Express",
-"tzgeturl","TZGETURL",
-"viking","Viking",
-"webfetcher","WebFetcher",
-"webexplorer","IBM-WebExplorer",
-"webmirror","WebMirror",
-"webvcr","WebVCR",
-# Site grabbers
-"teleport","TelePort Pro (site grabber)",
-"webcapture","Acrobat (site grabber)",
-"webcopier", "WebCopier (site grabber)",
-# Music only browsers
-"real","RealAudio or compatible (media player)",
-"winamp","WinAmp (media player)",                              # Works for winampmpeg and winamp3httprdr
-"windows-media-player","Windows Media Player (media player)",
-"audion","Audion (media player)",
-"freeamp","FreeAmp (media player)",
-"itunes","Apple iTunes (media player)",
-"jetaudio","JetAudio (media player)",
-"mint_audio","Mint Audio (media player)",
-"mpg123","mpg123 (media player)",
-"nsplayer","NetShow Player (media player)",
-"sonique","Sonique (media player)",
-"uplayer","Ultra Player (media player)",
-"xmms","XMMS (media player)",
-"xaudio","Some XAudio Engine based MPEG player (media player)",
-# PDA/Phonecell browsers
-"mmef","Microsoft Mobile Explorer (PDA/Phone browser)",
-"mspie","MS Pocket Internet Explorer (PDA/Phone browser)",
-"up\.","UP.Browser (PDA/Phone browser)",                                       # Works for UP.Browser and UP.Link
-"wapalizer","WAPalizer (PDA/Phone browser)",
-"wapsilon","WAPsilon (PDA/Phone browser)",
-"webcollage","WebCollage (PDA/Phone browser)",
-"alcatel","Alcatel Browser (PDA/Phone browser)",
-"nokia","Nokia Browser (PDA/Phone browser)",
-# Others (TV)
-"webtv","WebTV browser",
-# Other kind of browsers
-"csscheck","WDG CSS Validator",
-"w3m","w3m",
-"w3c_css_validator","W3C CSS Validator",
-"w3c_validator","W3C HTML Validator",
-"wdg_validator","WDG HTML Validator",
-"webzip","WebZIP",
-"staroffice","StarOffice",
-"libwww","LibWWW"
-);
-
 1;
index 8ac97b021da0de36dd7e44a91cde1c767a88f3fc..6c704f562652dcd4ca1e173ed82937d35902b7ea 100644 (file)
 # AWSTATS SEARCH ENGINES DATABASE
-#--------------------------------
-# Last change $Revision$ - $Author$ - $Date$
+#-------------------------------------------------------
+# If you want to add a Search Engine to extend AWStats database detection capabilities,
+# you must add an entry in SearchEnginesSearchIDOrder and in SearchEnginesHashIDLib.
+# An entry in SearchEnginesKnownUrl, if known, is also welcome.
+#-------------------------------------------------------
+# $Revision$ - $Author$ - $Date$
 
 
-# Search engines names database
-# To add a search engine, add a new line:
+# SearchEnginesSearchIDOrder
+# This list defines the order in which Search Engine IDs are tested (the most
+# frequent ones come first in this list to speed up detection).
+# Note: Each ID is used as a case-insensitive regex against the referer server name.
+#-----------------------------------------------------------------
+@SearchEnginesSearchIDOrder=(
+# Major international search engines
+"google\.",
+"msn\.",
+"voila\.",
+"yahoo\.",
+"lycos\.",
+"altavista\.",
+"search\.terra\.",
+"alltheweb\.com",
+"netscape\.",
+"dmoz\.org",
+"search\.aol\.co",
+"www\.search\.com",
+"overture\.com",               # Replace "goto\.com","Goto.com",
+# Minor international search engines
+"northernlight\.",
+"hotbot\.",
+"kvasir\.",
+"webcrawler\.",
+"metacrawler\.",
+"go2net\.com",
+"go\.com",
+"euroseek\.",
+"excite\.",
+"lokace\.",
+"spray\.",
+"netfind\.aol\.com",
+"recherche\.aol\.fr",
+"nbci\.com/search",
+"askjeeves\.",
+"mamma\.",
+"dejanews\.",
+"search\.dogpile\.com",
+"wisenut\.com",
+# Minor brazilian search engines
+"engine\.exe", "miner\.bol\.com\.br",
+# Minor danish search-engines 
+"opasia\.dk", "danielsen\.com",
+# Minor dutch search engines
+"ilse\.","vindex\.",
+# Minor english search engines
+"splut\.","ukplus\.","mirago\.","ukindex\.co\.uk","ukdirectory\.",
+# Minor finnish search engines
+"haku\.www\.fi",
+# Minor french search engines
+"nomade\.fr/","ctrouve\.","francite\.","\.lbb\.org","rechercher\.libertysurf\.fr",
+# Minor german search engines
+"fireball\.de","infoseek\.de","suche\.web\.de","meta\.ger",
+# Minor italian search engines
+"virgilio\.it",
+# Minor norwegian search engines
+"sok\.start\.no",
+# Minor swedish search engines
+"evreka\.passagen\.se",
+# Minor czech search engines
+"atlas\.cz","seznam\.cz","quick\.cz","centrum\.cz","najdi\.to","redbox\.cz",
+# Other
+"search\..*com"
+);
+
+
+# SearchEnginesHashIDLib
+# List of search engine names
 # "match_string_in_url_that_identify_engine", "search_engine_name",
 #-----------------------------------------------------------------
 %SearchEnginesHashIDLib=(
-# Most common search engines
-"yahoo\.","Yahoo",
-"altavista\.","AltaVista",
+# Major international search engines
+"google\.","Google",
 "msn\.","MSN",
 "voila\.", "Voila",
+"yahoo\.","Yahoo",
 "lycos\.","Lycos",
+"altavista\.","AltaVista",
 "search\.terra\.","Terra",
-"google\.","Google",
 "alltheweb\.com","AllTheWeb",
 "netscape\.","Netscape",
-"northernlight\.","NorthernLight",
 "dmoz\.org","DMOZ",
 "search\.aol\.co","AOL",
 "www\.search\.com","Search.com",
-"kvasir\.","Kvasir",
-# Others
+"overture\.com","Overture",            # Replace "goto\.com","Goto.com",
+# Minor international search engines
+"northernlight\.","NorthernLight",
 "hotbot\.","Hotbot",
+"kvasir\.","Kvasir",
 "webcrawler\.","WebCrawler",
 "metacrawler\.","MetaCrawler (Metamoteur)",
 "go2net\.com","Go2Net (Metamoteur)",
 "go\.com","Go.com",
-"overture\.com","Overture",            # Replace "goto\.com","Goto.com",
 "euroseek\.","Euroseek",
 "excite\.","Excite",
 "lokace\.", "Lokace",
 "dejanews\.","DejaNews",
 "search\.dogpile\.com","Dogpile",
 "wisenut\.com","WISENut",
-"engine\.exe","Cade", "miner\.bol\.com\.br","Meta Miner",              # Minor brazilian search engines
-"opasia\.dk","Opasia", "danielsen\.com","Thor (danielsen.com)",        # Minor danish search-engines 
-"ilse\.","Ilse","vindex\.","Vindex\.nl",                                               # Minor dutch search engines
-"splut\.","Splut", "ukplus\.", "UKPlus", "mirago\.", "Mirago", "ukindex\.co\.uk", "UKIndex", "ukdirectory\.","UK Directory", # Minor english search engines
-"haku\.www\.fi","Ihmemaa",                                                                             # Minor finnish search engines
-"nomade\.fr/","Nomade", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf",        # Minor french search engines
-"fireball\.de","Fireball", "infoseek\.de","Infoseek", "suche\.web\.de","Web.de", "meta\.ger","MetaGer",        # Minor german search engines
-"virgilio\.it","Virgilio",                                                                             # Minor italian search engines
-"sok\.start\.no","start.no",                                                                   # Minor norvegian search engines
-"evreka\.passagen\.se","Evreka",                                                               # Minor swedish search engines
-"atlas\.cz","Atlas.cz",        "seznam\.cz","Seznam.cz", "quick\.cz","Quick.cz", "centrum\.cz","Centrum.cz",   #Minor czech search engines
+# Minor brazilian search engines
+"engine\.exe","Cade", "miner\.bol\.com\.br","Meta Miner",
+# Minor danish search-engines 
+"opasia\.dk","Opasia", "danielsen\.com","Thor (danielsen.com)",        
+# Minor dutch search engines
+"ilse\.","Ilse","vindex\.","Vindex\.nl",                                               
+# Minor english search engines
+"splut\.","Splut", "ukplus\.", "UKPlus", "mirago\.", "Mirago", "ukindex\.co\.uk", "UKIndex", "ukdirectory\.","UK Directory",
+# Minor finnish search engines
+"haku\.www\.fi","Ihmemaa",                                                                             
+# Minor french search engines
+"nomade\.fr/","Nomade", "ctrouve\.","C'est trouvé", "francite\.","Francité", "\.lbb\.org", "LBB", "rechercher\.libertysurf\.fr", "Libertysurf",        
+# Minor german search engines
+"fireball\.de","Fireball", "infoseek\.de","Infoseek", "suche\.web\.de","Web.de", "meta\.ger","MetaGer",        
+# Minor italian search engines
+"virgilio\.it","Virgilio",                                                                             
+# Minor norwegian search engines
+"sok\.start\.no","start.no",                                                                   
+# Minor swedish search engines
+"evreka\.passagen\.se","Evreka",                                                               
+# Minor czech search engines
+"atlas\.cz","Atlas.cz",        "seznam\.cz","Seznam.cz", "quick\.cz","Quick.cz", "centrum\.cz","Centrum.cz","najdi\.to","Najdi.to","redbox\.cz","RedBox.cz",
+# Other
 "search\..*com","Other search engines"
 );
 
-# Search engines known URLs rules to find keywords
+
+# SearchEnginesKnownUrl
+# Search engines known rules to extract keywords from a referrer URL
 #-------------------------------------------------
 %SearchEnginesKnownUrl=(
 # Most common search engines
 "fireball\.de","q=", "infoseek\.de","qt=", "suche\.web\.de","su=",                     # Minor german search engines
 "sok\.start\.no", "q=",                                                                                        # Minor norvegian search engines
 "evreka\.passagen\.se","q=",                                                                           # Minor swedish search engines
-"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q="           # Minor czech search engines
+"atlas\.cz","searchtext=", "seznam\.cz","w=", "ftxt\.quick\.cz","query=", "centrum\.cz","q=", "najdi\.to","dotaz=", "redbox.cz","srch="                # Minor czech search engines
 );
+
 # If no rules are known, this will be used to clean URL of not keyword parameters.
 @WordsToCleanSearchUrl= ("act=","annuaire=","btng=","categoria=","cfg=","cof=","cou=","cp=","dd=","domain=","dt=","dw=","exec=","geo=","hc=","height=","hl=","hq=","hs=","id=","kl=","lang=","loc=","lr=","matchmode=","medor=","message=","meta=","mode=","order=","page=","par=","pays=","pg=","pos=","prg=","qc=","refer=","sa=","safe=","sc=","sort=","src=","start=","style=","stype=","sum=","tag=","temp=","theme=","url=","user=","width=","what=","\\.x=","\\.y=","y=","look=");
 # Never put the following exclusion ("ask=","claus=","general=","kw=","keyword=","keywords=","MT","p=","q=","qr=","qt=","query=","s=","search=","searchText=","string=","su=","w=") because they are strings that contain keywords we're looking for.