From b8acd3f3fd2f6b35b354208be7ec0078386e6397 Mon Sep 17 00:00:00 2001 From: Laurent Destailleur Date: Mon, 16 Mar 2015 01:02:01 +0100 Subject: [PATCH] =?utf8?q?More=20robots=20and=20search=20engines=20from=20?= =?utf8?q?Albrecht=20M=C3=BCller?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- wwwroot/cgi-bin/lib/robots.pm | 259 ++++++++++++++++++++------ wwwroot/cgi-bin/lib/search_engines.pm | 117 +++++++++++- 2 files changed, 313 insertions(+), 63 deletions(-) diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 5459bf2e..73b1c15f 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -8,17 +8,38 @@ # expressions to the user agent string in the order given by the lists. The # first match specifies the robot. # +# Note: This regular expression must not contain any whitespace. +# Otherwise AWStats will produce lines in the database that +# will be misinterpreted and as a consequence the corresponding data in the +# generated HTML reports will be wrong. If you want to match whitespace in +# the user agent string, use other constructs like '\s', '[:blank:]', +# '\p{IsSpace}', '\x20' etc. +# # The corresponding entry in RobotsHashIDLib contains the regular expression # as key, followed by a string containing HTML-text. AWStats inserts this # text into reports to describe the bot. If possible the text should contain -# a link to the bot home page. This make it easier for systadmins to find +# a link to the bot home page. This makes it easier for sysadmins to find # the information necessary e.g. to adapt the robots.txt file. # # An entry in the RobotsAffiliateLib is not necessary. An entry in this list # contains as first part the regular expression specifying the bot. The # second part is a string that gives the Company or product managing the bot. # This information is not used yet. 
- +# +# There are several sorts of bots that AWStats is not able to detect and +# therefore a considerable amount of bot generated traffic counts +# as user traffic: +# +# a) A crawler that identifies itself in the referrer string, but not in +# the user agent string. An example is the crawler from semalt.semalt.com. +# +# b) Crawlers that correctly access robots.txt but identify themselves in +# the user agent string only once or just a few times. Most of the +# time a user agent string is used that does not contain hints that +# a bot is involved. An example is the iCjobs spider. +# +# +# #------------------------------------------------------- # 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html @@ -318,7 +339,7 @@ # added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ # -- fix - some robots were reported with _ where _ should have been a space. # changed Xenu Link Sleuth -# changed microsoft[_+ ]url[_+ ]control -> microsoft_url_control +# changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control # changed favorites_sweeper -> favorites_sweeper # -- updates # updated AskJeeves to Ask @@ -353,7 +374,7 @@ # used to know in which order to search Robot IDs. # Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more # Minor robots are in list2, used when LevelForRobotsDetection is 2 or more -# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+ ]' and are quoted. +# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. 
#------------------------------------------------------- @RobotsSearchIDOrder_list1 = ( # Common robots (In robot file) @@ -366,16 +387,16 @@ 'googlebot\-image', 'googlebot', 'google\-sitemaps', -'google[_+ ]web[_+ ]preview', +'google[_+\s]web[_+\s]preview', 'grabber', 'gulliver', -'virus[_+ ]detector', # Must be before harvest +'virus[_+\s]detector', # Must be before harvest 'harvest', 'htdig', 'jeeves', 'linkwalker', 'lilina', -'lycos[_+ ]', +'lycos[_+\s]', 'moget', 'muscatferret', 'myweb', @@ -396,6 +417,7 @@ 'mercator', 'netcraft', 'msnbot\-media', +'msnbot-udiscovery', 'msnbot', 'petersnews', 'relevantnoise\.com', @@ -409,6 +431,7 @@ ); @RobotsSearchIDOrder_list2 = ( # Less common robots (In robot file) +'007ac9', '[^a]fish', 'abcdatos', 'abonti\.com', @@ -445,12 +468,14 @@ 'christcrawler', 'churl', 'cienciaficcion', +'cms\scrawler', 'collective', 'combine', 'conceptbot', 'coolbot', 'core', 'cosmos', +'crazywebcrawler', 'cruiser', 'cusco', 'cyberspyder', @@ -461,6 +486,7 @@ 'diibot', 'direct_hit', 'dnabot', +'domainappender', 'download_express', 'dragonbot', 'dwcp', @@ -528,7 +554,7 @@ 'kapsi', 'katipo', 'kilroy', -'ko[_+ ]yappo[_+ ]robot', +'ko[_+\s]yappo[_+\s]robot', 'kummhttp', 'labelgrabber\.txt', 'larbin', @@ -597,6 +623,8 @@ 'roverbot', 'rules', 'safetynetrobot', +'semalt', #Note: This entry will not work as this crawler identifies itself +# in the referrer string and not in the user agent string 'search\-info', 'search_au', 'searchprocess', @@ -606,6 +634,7 @@ 'shaihulud', 'sift', 'simbot', +'sistrix', 'site\-valet', 'sitetech', 'skymob', @@ -614,7 +643,7 @@ 'snooper', 'solbot', 'speedy', -'spider[_+ ]monkey', +'spider[_+\s]monkey', 'spiderbot', 'spiderline', 'spiderman', @@ -681,6 +710,7 @@ 'woozweb', 'wwwc', 'wz101', +'xenu\slink\ssleuth', 'xget', # Other robots reported by users '1\-more_scanner', @@ -690,12 +720,15 @@ 'activebookmark', 'adamm_bot', 'adsbot-google', +'advbot', +'affectv\.co\.uk', 'almaden', 'aipbot', 'aleadsoftbot', 
'alpha_search_agent', 'allrati', 'aport', +'archive\-de\.com', 'archive\.org_bot', 'argus', # Must be before nutch 'arianna\.libero\.it', @@ -709,8 +742,9 @@ 'betabot', 'biglotron', 'bittorrent_bot', -'biz360[_+ ]spider', -'blogbridge[_+ ]service', +'biz360[_+\s]spider', +'blexbot', +'blogbridge[_+\s]service', 'bloglines', 'blogpulse', 'blogsearch', @@ -724,13 +758,15 @@ 'boris', 'bubing', 'bumblebee', -'candlelight[_+ ]favorites[_+ ]inspector', +'candlelight[_+\s]favorites[_+\s]inspector', 'careerbot', 'cbn00glebot', +'ccbot', 'cerberian_drtrs', 'cfnetwork', 'cipinetbot', 'checkweb_link_validator', +'cliqzbot', 'commons\-httpclient', 'computer_and_automation_research_institute_crawler', 'converamultimediacrawler', @@ -749,8 +785,10 @@ 'deepindex', 'dipsie\.bot', 'dnsgroup', +'doccheckbot', 'domainchecker', 'domainsdb\.net', +'dotbot', 'dulance', 'dumbot', 'dumm\.de\-bot', @@ -758,20 +796,24 @@ 'easydl', 'eccp', 'edgeio\-retriever', +'ernst[:blank:]2\.0', 'ets_v', 'exactseek', -'extreme[_+ ]picture[_+ ]finder', +'extreme[_+\s]picture[_+\s]finder', 'eventax', 'everbeecrawler', 'everest\-vulcan', 'ezresult', 'enteprise', 'facebook', +'facebot', 'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', +'finderlein[_+\s]research[_+\s]crawler', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler 'fast_enterprise_crawler', 'fast\-search\-engine', +'fastbot', 'favicon', 'favorg', 'favorites_sweeper', @@ -781,6 +823,7 @@ 'feedster', 'feedsky', 'feedvalidator', +'fetchbot', 'filmkamerabot', 'filterdb\.iss\.net', 'findlinks', @@ -791,6 +834,7 @@ 'g2crawler', 'gaisbot', 'geniebot', +'gigablastopensource', 'gigabot', 'girafabot', 'global_fetch', @@ -807,11 +851,13 @@ 'hoowwwer', 'hpprint', 'htmlparser', -'html[_+ ]link[_+ ]validator', +'html[_+\s]link[_+\s]validator', 'httrack', 'hundesuche\.com\-bot', 'i-bot', +'icarus6j', 'ichiro', +'idmarch', 
'iltrovatore\-setaccio', 'infobot', 'infociousbot', @@ -819,7 +865,7 @@ 'infomine', 'insurancobot', 'integromedb\.org', -'internet[_+ ]ninja', +'internet[_+\s]ninja', 'internetarchive', 'internetseer', 'internetsupervision', @@ -828,7 +874,9 @@ 'isearch2006', 'istellabot', 'iupui_research_bot', -'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility', +'izsearch', +'james\sbot', +'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', 'justview', 'kalambot', 'kamano\.de_newsfeedverzeichnis', @@ -851,21 +899,30 @@ 'link_valet_online', 'metager\-linkchecker', # Must be before linkchecker 'linkchecker', +'lipperhey', 'livejournal\.com', 'lmspider', +'loadtimebot', +'lssrocketcrawler', 'ltbot', +'ltx71', 'lwp\-request', 'lwp\-trivial', +'madaali\.de', 'magpierss', 'mail\.ru', 'mapoftheinternet\.com', +'meanpathbot', +'mediabot', 'mediapartners\-google', 'megite', +'memorybot', +'metager2-verification-bot', 'metaspinner', 'miadev', 'microsoft bits', 'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery', -'microsoft[_+ ]url[_+ ]control', +'microsoft[_+\s]url[_+\s]control', 'mini\-reptile', 'minirank', 'missigua_locator', @@ -875,6 +932,7 @@ 'mj12bot', 'mojeekbot', 'msiecrawler', +'ms[_+\s]search[_+\s]6\.0[_+\s]robot', 'ms_search_4\.0_robot', 'msrabot', 'msrbot', @@ -890,17 +948,19 @@ 'nimblecrawler', 'noxtrumbot', 'npbot', +'loocalcrawler/nutch', 'nutchcvs', 'nutchosu\-vlib', 'nutch', # Must come after other nutch versions 'ocelli', 'octora_beta_bot', -'omniexplorer[_+ ]bot', -'onet\.pl[_+ ]sa', +'omniexplorer[_+\s]bot', +'onet\.pl[_+\s]sa', 'onfolio', 'opentaggerbot', 'openwebspider', 'oracle_ultra_search', +'orangebot', 'orbiter', 'yodaobot', 'qihoobot', @@ -908,7 +968,9 @@ 'pear_http_request_class', 'peerbot', 'perman', -'php[_+ ]version[_+ ]tracker', +'php[_+\s]version[_+\s]tracker', +'phpcrawl', +'picmole', 'pictureofinternet', 'ping\.blo\.gs', 'plinki', @@ -920,28 +982,34 @@ 
'postfavorites', 'projectwf\-java\-test\-crawler', 'proodlebot', +'publiclibraryarchive', 'pyquery', 'rambler', 'redalert', +'rogerbot', 'rojo', 'rssimagesbot', 'ruffle', 'rufusbot', 'sandcrawler', +'savetheworldheritage', 'sbider', 'schizozilla', 'scumbot', -'searchguild[_+ ]dmoz[_+ ]experiment', +'searchguild[_+\s]dmoz[_+\s]experiment', 'searchmetricsbot', 'seekbot', 'semrushbot', 'sensis_web_crawler', +'seodiver', 'seokicks\.de', 'seznambot', 'shim\-crawler', 'shoutcast', +'sitedomain-bot', 'siteexplorer\.info', 'slysearch', +'smtbot', 'snap\.com_beta_crawler', 'sohu\-search', 'sohu', # "sohu agent" @@ -954,6 +1022,7 @@ 'ssearch_bot', 'steeler', 'steroid__download', +'stq_bot', 'suchfin\-bot', 'superbot', 'surveybot', @@ -988,7 +1057,7 @@ 'vortex', 'vse\/', 'w3c\-checklink', -'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', +'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'w3c_validator', 'watchmouse', 'wavefire', @@ -1001,16 +1070,19 @@ 'webfilter', 'webindexer', 'webminer', -'website[_+ ]monitoring[_+ ]bot', +'website[_+\s]monitoring[_+\s]bot', 'webvulncrawl', 'wells_search', 'wesee:search', +'wevikabot', 'wonderer', +'wotbox', 'wume_crawler', 'wwweasel', 'xenu\'s_link_sleuth', 'xenu_link_sleuth', 'xirq', +'xovibot', 'y!j', # Must come after keyoshid Y!J 'yacy', 'yahoo\-blogs', @@ -1074,13 +1146,13 @@ '^motorola$', 'movabletype', # These appear to be bots trying to hide. All of the usual architecture data is missing. -'^mozilla\/3\.0 \(compatible$', +'^mozilla\/3\.0\s\(compatible$', '^mozilla\/4\.0$', -'^mozilla\/4\.0 \(compatible;\)$', +'^mozilla\/4\.0\s\(compatible;\)$', '^mozilla\/5\.0$', -'^mozilla\/5\.0 \(compatible;$', -'^mozilla\/5\.0 \(en\-us\)$', -'^mozilla\/5\.0 firefox\/3\.0\.5$', +'^mozilla\/5\.0\s\(compatible;$', +'^mozilla\/5\.0\s\(en\-us\)$', +'^mozilla\/5\.0\sfirefox\/3\.0\.5$', '^msie', # End of hiding bots. 
'netnewswire', @@ -1157,6 +1229,9 @@ 'spider', 'sucker', 'bot[\s_+:,\.\;\/\\\-]', +# Identifies +#"Mozilla/5.0 (Linux; U; Android 4.2.2; de-de; CUBOT P9 Build/JDQ39) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30" +#as a bot. There is an Android mobile phone called "CUBOT P9", so this is probably not a bot. '[\s_+:,\.\;\/\\\-]bot', 'curl', 'php', @@ -1181,15 +1256,15 @@ 'googlebot','Googlebot', 'google\-sitemaps', 'Google Sitemaps', 'grabber', 'Grabber (SDSC)', -'google[_+ ]web[_+ ]preview', 'Google Web Preview', +'google[_+\s]web[_+\s]preview', 'Google Web Preview', 'gulliver','Northern Light Gulliver', -'virus[_+ ]detector','virus_detector', +'virus[_+\s]detector','virus_detector', 'harvest','Harvest', 'htdig','ht://Dig', 'jeeves','Ask', 'linkwalker','LinkWalker', 'lilina','Lilina', -'lycos[_+ ]','Lycos', +'lycos[_+\s]','Lycos', 'moget','moget', 'muscatferret','Muscat Ferret', 'myweb','Internet Shinchakubin', @@ -1209,6 +1284,7 @@ 'jennybot','JennyBot', 'mercator','Mercator', 'msnbot\-media','MSNBot-media', +'msnbot-udiscovery', 'msnbot-UDiscovery Note: Most traffic counts as user traffic', 'msnbot','MSNBot', 'netcraft','Netcraft', 'petersnews','Petersnews', @@ -1220,6 +1296,7 @@ 'webcollage','WebCollage', 'cfetch','Cfetch', # Less common robots (In robot file) +'007ac9', '007ac9 Crawler, seems to belong to SISTRIX', '[^a]fish','Fish search', 'abcdatos','ABCdatos BotLink', 'abonti\.com','Abonti WebSearch', @@ -1256,12 +1333,14 @@ 'christcrawler','ChristCrawler.com', 'churl','churl', 'cienciaficcion','cIeNcIaFiCcIoN.nEt', +'cms\scrawler', 'CMS Crawler', 'collective','Collective', 'combine','Combine System', 'conceptbot','Conceptbot', 'coolbot','CoolBot', 'core','Web Core / Roots', 'cosmos','XYLEME Robot', +'crazywebcrawler', 'CrazyWeb Crawler', 'cruiser','Internet Cruiser Robot', 'cusco','Cusco', 'cyberspyder','CyberSpyder Link Test', @@ -1272,6 +1351,7 @@ 'diibot','Digital Integrity Robot', 'direct_hit','Direct Hit Grabber', 
'dnabot','DNAbot', +'domainappender', 'DomainAppender', 'download_express','DownLoad Express', 'dragonbot','DragonBot', 'dwcp','DWCP (Dridus\' Web Cataloging Project)', @@ -1314,13 +1394,16 @@ 'iaskspider','Sina Iask Spider', 'hl_ftien_spider','Hylanda', 'sogou','Sogou Spider', -'icjobs\.de', 'iCjobs Spider', +'icjobs\.de', 'iCjobs Spider Note: Most traffic counts as user traffic', #20130805 The user agent string of the icjobs-spider contained the #identifying string only when it accessed the robots.txt file. #When it accessed the actual content it did not identify itself as #a spider. Thus traffic of this spider was counted as user traffic. #The behavious seems to have changed now - the spider identifies itself #when it accesses content pages. +#20141401 Behavior as before: Does identify itself when it accesses +# robots.txt and the root page. The following traffic does not contain +# the identification string and is therefore counted as user traffic. 'iconoclast','Popular Iconoclast', 'ilse','Ingrid', 'imagelock','Imagelock', @@ -1346,7 +1429,7 @@ 'kapsi','image.kapsi.net', 'katipo','Katipo', 'kilroy','Kilroy', -'ko[_+ ]yappo[_+ ]robot','KO_Yappo_Robot', +'ko[_+\s]yappo[_+\s]robot','KO_Yappo_Robot', 'kummhttp','KummHttp', 'labelgrabber\.txt','LabelGrabber', 'larbin','larbin', @@ -1418,6 +1501,7 @@ 'roverbot','Roverbot', 'rules','RuLeS', 'safetynetrobot','SafetyNet Robot', +'semalt', 'seamalt.com', 'search\-info','Sleek', 'search_au','Search.Aus-AU.COM', 'searchprocess','SearchProcess', @@ -1427,6 +1511,7 @@ 'shaihulud','Shai\'Hulud', 'sift','Sift', 'simbot','Simmany Robot Ver1.0', +'sistrix', 'SISTRIX Crawler', 'site\-valet','Site Valet', 'sitetech','SiteTech-Rover', 'skymob','Skymob.com', @@ -1435,7 +1520,7 @@ 'snooper','Snooper', 'solbot','Solbot', 'speedy','Speedy Spider', -'spider[_+ ]monkey','Spider monkey', +'spider[_+\s]monkey','Spider monkey', 'spiderbot','SpiderBot', 'spiderline','Spiderline Crawler', 'spiderlytics', 'Spiderlytics: No homepage, e-mail 
only: spider (at) spiderlytics.com', @@ -1503,6 +1588,7 @@ 'woozweb','Woozweb Monitoring', 'wwwc','WWWC Ver 0.2.5', 'wz101','WebZinger', +'xenu\slink\ssleuth', 'Xenu'. "'" . 's Link Sleuth (TM), see Wikipedia', 'xget','XGET', # Other robots reported by users '1\-more_scanner','1-More Scanner', @@ -1512,12 +1598,15 @@ 'activebookmark','ActiveBookmark', 'adamm_bot','AdamM Bot', 'adsbot-google', 'AdsBot-Google', +'advbot', 'AdvBot', +'affectv\.co\.uk', 'affectv.co.uk', 'almaden','IBM Almaden Research Center WebFountain™', 'aipbot','aipbot', 'aleadsoftbot','ALeadSoftbot', 'alpha_search_agent','Alpha Search Agent', 'allrati','Allrati', 'aport', 'Aport', +'archive\-de\.com', 'Archive-de.com', 'archive\.org_bot','archive.org bot', 'argus','Argus', 'arianna\.libero\.it','arianna.libero.it', @@ -1531,8 +1620,9 @@ 'betabot','BetaBot', 'biglotron','Biglotron', 'bittorrent_bot','BitTorrent Bot', -'biz360[_+ ]spider','Biz360 spider', -'blogbridge[_+ ]service','BlogBridge Service', +'biz360[_+\s]spider','Biz360 spider', +'blexbot', 'BLEXBot, seems to belong to the WebMeUp backlink tool', +'blogbridge[_+\s]service','BlogBridge Service', 'bloglines','Bloglines', 'blogpulse','BlogPulse ISSpider intelliseek.com', 'blogsearch','BlogSearch', @@ -1546,13 +1636,15 @@ 'boris', 'Boris', 'bubing', 'BUbiNG', 'bumblebee', 'Bumblebee (relevare.com)', -'candlelight[_+ ]favorites[_+ ]inspector','Candlelight_Favorites_Inspector', +'candlelight[_+\s]favorites[_+\s]inspector','Candlelight_Favorites_Inspector', 'careerbot', 'CareerBot', 'cbn00glebot','cbn00glebot', +'ccbot', 'Common Crawl', 'cerberian_drtrs','Cerberian Drtrs', 'cfnetwork','CFNetwork', 'cipinetbot','CipinetBot', 'checkweb_link_validator','CheckWeb link validator', +'cliqzbot', 'Cliqzbot', 'commons\-httpclient','Jakarta commons-httpclient', 'computer_and_automation_research_institute_crawler','Computer and Automation Research Institute Crawler', 'converamultimediacrawler','ConveraMultiMediaCrawler', @@ -1570,8 +1662,10 @@ 
'deepindex','Deepindex', 'dipsie\.bot','Dipsie', 'dnsgroup','DNSGroup', +'doccheckbot', 'doccheckbot/1.0, known to Project Honey Pot', 'domainchecker','DomainChecker', 'domainsdb\.net','DomainsDB.net', +'dotbot', 'DotBot, Open Site Explorer', 'dulance','Dulance', 'dumbot','Dumbot', 'dumm\.de\-bot','dumm.de-Bot', @@ -1579,20 +1673,24 @@ 'easydl','EasyDL', 'eccp', 'Eniro Sverige, email: search (at) eniro.com', 'edgeio\-retriever','edgeio-retriever', +'ernst[:blank:]2\.0', 'Ernst 2.0 (does not provide any further information)', 'ets_v','ETS Enterprise Translation Server', 'exactseek','ExactSeek Crawler', -'extreme[_+ ]picture[_+ ]finder','Extreme_Picture_Finder', +'extreme[_+\s]picture[_+\s]finder','Extreme_Picture_Finder', 'eventax','eventax', 'everbeecrawler','EverbeeCrawler', 'everest\-vulcan','Everest-Vulcan', 'ezresult', 'Ezresult', 'enteprise','Fast Enteprise Crawler', 'facebook','FaceBook bot', +'facebot', 'Facebot (Facebook bot?)', 'fast\-search\-engine','Fast-Search-Engine (not fastsearch.com)', 'fast_enterprise_crawler','FAST Enterprise Crawler', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * crawleradmin.t-info@telekom.de', +'finderlein[_+\s]research[_+\s]crawler', 'Finderlein Research Crawler 1.0 (no contact information given)', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','Matrix S.p.A. 
- FAST Enterprise Crawler', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de', +'fastbot', 'fastbot', 'favicon','FavIconizer', 'favorg','FavOrg', 'favorites_sweeper','Favorites Sweeper', @@ -1602,6 +1700,7 @@ 'feedster','Feedster', 'feedsky','FeedSky', 'feedvalidator','FeedValidator', +'fetchbot', 'Fetchbot', 'filmkamerabot','FilmkameraBot', 'filterdb\.iss\.net', 'oBot', 'findexa_crawler','Findexa Crawler', @@ -1612,6 +1711,7 @@ 'g2crawler','G2Crawler', 'gaisbot','Gaisbot', 'geniebot','Geniebot', +'gigablastopensource', 'GigablastOpenSource, an Open Source Search Engine(Wiki)', 'gigabot','GigaBot', 'girafabot','Girafabot', 'global_fetch','Global Fetch', @@ -1628,11 +1728,13 @@ 'hoowwwer','HooWWWer', 'hpprint','HPPrint', 'htmlparser','HTMLParser', -'html[_+ ]link[_+ ]validator','Html_Link_Validator', +'html[_+\s]link[_+\s]validator','Html_Link_Validator', 'httrack','HTTrack off-line browser', 'hundesuche\.com\-bot','Hundesuche.com-Bot', 'i-bot','i-bot', +'icarus6j', 'Icarus6j, email address in UA string, no website', 'ichiro','ichiro', +'idmarch', 'IDMARCH', 'iltrovatore\-setaccio','IlTrovatore-Setaccio', 'infobot','InfoBot', 'infociousbot','InfociousBot', @@ -1640,7 +1742,7 @@ 'infomine','INFOMINE VLCrawler', 'insurancobot','InsurancoBot', 'integromedb\.org','IntegromeDB', -'internet[_+ ]ninja','Internet_Ninja ', +'internet[_+\s]ninja','Internet_Ninja ', 'internetarchive','InternetArchive', 'internetseer', 'InternetSeer', 'internetsupervision','InternetSupervision', @@ -1648,7 +1750,9 @@ 'isearch2006','isearch2006', 'istellabot', 'IstellaBot', 'iupui_research_bot','IUPUI_Research_Bot', -'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility','JRTwine_Software_Check_Favorites_Utility', +'izsearch', 'iZSearch', +'james\sbot', 'James BOT', +'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','JRTwine_Software_Check_Favorites_Utility', 'justview', 
'JustView', 'kalambot','KalamBot', 'kamano\.de_newsfeedverzeichnis','kamano.de NewsFeedVerzeichnis', @@ -1670,19 +1774,31 @@ 'linkbot','LinkBot', 'linkdex\.com', 'Linkdex', 'linkchecker','LinkChecker', +'lipperhey', 'Lipperhey SEO Service', 'livejournal\.com', 'LiveJournal.com', +'loadtimebot', 'LoadTimeBot', +'lssrocketcrawler', 'LSSRocketCrawler (no contact information)', 'ltbot', 'Language Tools Bot (ltbot)', +'ltx71', 'ltx71', +'madaali\.de', 'www.madaali.de', 'magpierss', 'MagpieRSS', 'mail\.ru', 'Mail.ru bot', 'mapoftheinternet\.com','MapoftheInternet.com', +'meanpathbot', 'Meanpathbot', +'mediabot', 'MediaBot', 'mediapartners\-google','Google AdSense', +# 'Mediapartners-Google (Feb 12, 2015: no additial information in UA String, seems to use GigablastOpenSource', +# Uses UA string "Mediapartners-Google" only, and there were accesses using an UA string "GigablastOpenSource/1.0" from the same IP-Address. +# Therefore this is probably not related to Google 4.3.2015 Albrecht Müller 'megite','Megite', +'memorybot', 'Archivethe.net', +'metager2-verification-bot', 'metager2-verification-bot', 'metager\-linkchecker','MetaGer LinkChecker', 'metaspinner','Metaspinner', 'miadev', 'MiaDev spider', 'microsoft bits', 'Microsoft Background Intelligent Transfer Service (BITS)?', 'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', -'microsoft[_+ ]url[_+ ]control','Microsoft URL Control', +'microsoft[_+\s]url[_+\s]control','Microsoft URL Control', 'minirank','miniRank', 'mini\-reptile','Mini-reptile', 'missigua_locator','Missigua_Locator', @@ -1692,6 +1808,7 @@ 'mj12bot','MJ12bot', 'mojeekbot','MojeekBot', 'msiecrawler','MSIECrawler', +'ms[_+\s]search[_+\s]6\.0[_+\s]robot','MS Search 6.0 Robot (MS SharePoint Portal Server?)', 'ms_search_4\.0_robot','MS SharePoint Portal Server - MS Search 4.0 Robot', 'msrabot','msrabot', 'msrbot','MSRBOT', @@ -1707,17 +1824,19 @@ 'nimblecrawler','NimbleCrawler', 'noxtrumbot','noxtrumbot', 
'npbot','NPBot', +'loocalcrawler/nutch', 'LoocalCrawler/Nutch', 'nutchcvs','NutchCVS', 'nutchosu\-vlib','NutchOSU-VLIB', 'nutch','Nutch', 'ocelli','Ocelli', 'octora_beta_bot','Octora Beta Bot', -'omniexplorer[_+ ]bot','OmniExplorer Bot', -'onet\.pl[_+ ]sa','Onet.pl_SA', +'omniexplorer[_+\s]bot','OmniExplorer Bot', +'onet\.pl[_+\s]sa','Onet.pl_SA', 'onfolio','Onfolio', 'opentaggerbot','OpenTaggerBot', 'openwebspider','OpenWebSpider', 'oracle_ultra_search','Oracle Ultra Search', +'orangebot', 'OrangeBot, no website, log entry specifies mail address', # support.orangebot@orange.com 'orbiter','Orbiter', 'yodaobot','OutfoxBot/YodaoBot', 'qihoobot','QihooBot', @@ -1725,7 +1844,9 @@ 'pear_http_request_class','PEAR HTTP Request class', 'peerbot','PEERbot', 'perman', 'Perman surfer', -'php[_+ ]version[_+ ]tracker','PHP version tracker', +'php[_+\s]version[_+\s]tracker','PHP version tracker', +'phpcrawl', 'PHPCrawl', +'picmole', 'Specified address www.picmole.com was not reachable on April 21, 2014', 'pictureofinternet','PictureOfInternet', 'ping\.blo\.gs','ping.blo.gs', 'plinki','plinki', @@ -1737,29 +1858,47 @@ 'postfavorites','PostFavorites', 'projectwf\-java\-test\-crawler','ProjectWF-java-test-crawler', 'proodlebot','proodleBot', +'publiclibraryarchive', 'publiclibraryarchive.org (related to spiderlytics.com and/or waybackarchive.org?)', +#Observations 2014-06-23 +#Domain publiclibraryarchive.org is parked at GoDaddy.com +#from https://www.projecthoneypot.org/ +#81.30.151.220's User Agent Strings (honeypot classified this ip as an mail server, active about 6 years ago) +#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) +#176.9.138.27's User Agent Strings +#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) +#Mozilla/5.0 (compatible; Spiderlytics/1.0; +spider@spiderlytics.com) +#Mozilla/5.0 (compatible; waybackarchive.org/1.0; +spider@waybackarchive.org) +#146.0.32.165's User Agent Strings 
+#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) +#Mozilla/5.0 (compatible; savetheworldheritage.org/1.0; +crawl@savetheworldheritage.org) 'pyquery','PyQuery', 'rambler','StackRambler', 'redalert','Red Alert', 'relevantnoise\.com', 'Relevant Noise', +'rogerbot', 'Rogerbot', 'rojo','RoJo aggregator', 'rssimagesbot','rssImagesBot', 'ruffle','ruffle SemanticWeb crawler', 'rufusbot','RufusBot Rufus Web Miner', 'sandcrawler','SandCrawler (Microsoft)', +'savetheworldheritage', 'savetheworldheritage.org (related to spiderlytics.com, waybackarchive.org and/or publiclibraryarchive.org?)', 'sbider','SBIder', 'schizozilla','Schizozilla', 'scumbot','Scumbot', -'searchguild[_+ ]dmoz[_+ ]experiment','SearchGuild_DMOZ_Experiment', +'searchguild[_+\s]dmoz[_+\s]experiment','SearchGuild_DMOZ_Experiment', 'searchmetricsbot','SearchmetricsBot', 'seekbot','Seekbot', 'semrushbot', 'SemrushBot', 'sensis_web_crawler','Sensis Web Crawler', +'seodiver', 'SEO DIVER', 'seokicks\.de', 'SEOkicks Webcrawler', 'seznambot','SeznamBot', 'shim\-crawler','Shim-Crawler', 'shoutcast','Shoutcast Directory Service', +'sitedomain-bot', 'Sitedomain.de', 'siteexplorer\.info', 'Site Explorer', 'slysearch','SlySearch', +'smtbot', 'SMTBot', 'snap\.com_beta_crawler','snap.com beta crawler', 'sohu\-search','sohu-search', 'sohu','sohu agent', @@ -1771,6 +1910,7 @@ 'ssearch_bot', 'sSearch Crawler', 'steroid__download','STEROID Download', 'steeler','Steeler', +'stq_bot', 'SEARCHTEQ', 'suchfin\-bot','Suchfin-Bot', 'superbot','SuperBot', 'surveybot','SurveyBot', @@ -1785,7 +1925,7 @@ 'testbot','TestBot', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','T-H-U-N-D-E-R-S-T-O-N-E', 'topicblogs', 'topicblogs', -'turnitinbot','Turn It In', +'turnitinbot', 'Turn It In', 'turtle', 'Turtle', 'turtlescanner', 'Turtle', 'tutorgigbot','TutorGigBot', @@ -1805,7 +1945,7 @@ 'vortex','VORTEX', 'vse\/','VSE', 'w3c\-checklink','W3C Link Checker', -'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', 'W3C jigsaw CSS 
Validator', +'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'W3C jigsaw CSS Validator', 'w3c_validator','W3C Validator', 'watchmouse', 'WatchMouse Website Monitor', 'wavefire','Wavefire', @@ -1820,18 +1960,21 @@ 'webfilter','WebFilter', 'webindexer','WebIndexer', 'webminer','WebMiner', -'website[_+ ]monitoring[_+ ]bot','Website_Monitoring_Bot', +'website[_+\s]monitoring[_+\s]bot','Website_Monitoring_Bot', 'webvulncrawl', 'WebVulnCrawl', 'wells_search','Wells Search', 'wesee:search', 'WeSEE Bot', +'wevikabot', 'WeViKa', 'wonderer', 'Web Wombat Redback Spider', +'wotbox', 'Wotbox', 'wume_crawler','wume crawler', 'wwweasel',,'WWWeasel', 'xenu\'s_link_sleuth','Xenu Link Sleuth', 'xenu_link_sleuth','Xenu Link Sleuth', 'xirq','xirq', +'xovibot', 'XoviBot', 'y!j', 'Y!J Yahoo Japan', -'yacy','yacy', +'yacy', 'YaCy', 'yahoo\-blogs','Yahoo-Blogs', 'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', 'yahoofeedseeker', 'Yahoo Feed Seeker', @@ -1866,8 +2009,8 @@ 'scanner', 'Unknown robot (identified by \'scanner\')', 'spider', 'Unknown robot (identified by \'spider\')', 'sucker', 'Unknown robot (identified by \'sucker\')', -'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', -'[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'bot\' preceded by a space or one of the following characters _+:,.;/\-)', +'bot[\s_+:,\.\;\/\\\-]', 'Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', +'[\s_+:,\.\;\/\\\-]bot', 'Unknown robot (identified by a space or one of the characters _+:,.;/\- followed by \'bot\')', 'curl', 'Common *nix tool for automating web document retireval. 
Most likely a bot.', 'php', 'A PHP script', 'ruby\/', 'Ruby script', @@ -1911,13 +2054,13 @@ 'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', '^motorola$', 'Suspected Bot masquerading as "Motorola"', 'movabletype', 'movabletype', -'^mozilla\/3\.0 \(compatible$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/3\.0\s\(compatible$', 'Suspected bot masqurading as Mozilla', '^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/4\.0 \(compatible;\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/4\.0\s\(compatible;\)$', 'Suspected bot masqurading as Mozilla', '^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 \(compatible;$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 \(en\-us\)$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0 firefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\s\(compatible;$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\s\(en\-us\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', '^msie', 'Suspected bot masquerading as M$ IE', 'netnewswire', 'netnewswire', ' netseer ', 'Net Seer', @@ -1990,7 +2133,7 @@ 'fast\-webcrawler'=>'AllTheWeb', 'googlebot'=>'Google', 'google\-sitemap'=>'Google', -'google[_+ ]web[_+ ]preview'=>'Google', +'google[_+\s]web[_+\s]preview'=>'Google', 'msnbot'=>'MSN', 'nutch'=>'Looksmart', 'scooter'=>'AltaVista', diff --git a/wwwroot/cgi-bin/lib/search_engines.pm b/wwwroot/cgi-bin/lib/search_engines.pm index a8dea8b4..f49fcc28 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pm +++ b/wwwroot/cgi-bin/lib/search_engines.pm @@ -14,6 +14,8 @@ # Maybe use a search string without a slash, and - if necessary - # an entry in %NotSearchEnginesKeys , if this search string # matches entries that are not search engines. 
+# Example of a web address of a Amazon search engine: +# http://www.amazon.de/gp/bit/apps/web/SERP/search/ref=bit_bds-p24_serp_cr_de?ie=UTF8tagbase=bds-p24&query=deutsch+8.+klasse+gymnasium+protokoll # (b) A unique string to identify the search engine within AWStats # (c) A regular expression that finds the start of the query part in the # referrer string @@ -216,6 +218,7 @@ 'googlecom\.com', 'goggle\.co\.hu', '216\.239\.32\.20', +'173\.194\.32\.223', '216\.239\.(35|37|39|51)\.100', '216\.239\.(35|37|39|51)\.101', '216\.239\.5[0-9]\.104', @@ -323,8 +326,24 @@ 'searchya\.com', 'picsearch\.de', 'webssearches\.com', +'airzip\.inspsearch\.com', 'zapmeta\.de', 'localmoxie\.com', +'search-results\.mobi', +'androidsearch\.com', +'isearch\.nation\.com', +'search\.zonealarm\.com', +'www\.buenosearch\.com', +'search\.foxtab\.com', +'searches\.qone8\.com', +'startpage\.com', +'www\.qwant\.com', +'searches\.safehomepage\.com', +'searches\.vi-view\.com', +'wow\.utop\.it', +'windowssearch\.com', +'www\.wow\.com', +'searches\.globososo\.com', # Chello Portals 'chello\.at', 'chello\.be', @@ -391,6 +410,7 @@ 'search\.fbdownloader\.com', 'search\.babylon\.com', 'my\.allgameshome\.com', +'surfcanyon\.com', # Minor finnish search engines 'haku\.www\.fi', # Minor french search engines @@ -414,7 +434,7 @@ 'suche\.aol\.de', 'www\.startxxl\.com', 'www\.benefind\.de', -'www\.amazon\.de.*search', #Just as a reminder, probably will not work as AWstats seem to consider the host part of an URL only +'www\.amazon\.de.*search', #Just as a reminder, probably will not work as AWstats seems to consider the host part of an URL only 'de\.wow\.com', 'www\.vlips\.de', 'metager\.de', @@ -425,6 +445,11 @@ 'umfis\.de', 'fastbot\.de', 'tixuma\.de', +'suche\.freenet\.de', +'www\.izito\.de', +'extern\.peoplecheck\.de', +'www\.oneseek\.de', +'de\.wiki\.gov\.cn', # Minor Hungarian search engines 
'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu', 'tango\.hu', @@ -440,6 +465,7 @@ 'search\.genieo\.com', # Minor Japanese search engines 'ask\.jp','sagool\.jp', +'websearch\.rakuten\.co\.jp', # Minor Norwegian search engines 'sok\.start\.no', 'eniro\.no', # Minor Polish search engines @@ -455,6 +481,8 @@ 'sapo\.pt', # Minor swiss search engines 'search\.ch', 'search\.bluewin\.ch', +'www\.zapmeta\.ch', +'etools\.ch', # Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines 'pogodak\.' ); @@ -495,6 +523,7 @@ 'googlecom\.com','google', 'goggle\.co\.hu','google', '216\.239\.32\.20', 'google', +'173\.194\.32\.223', 'google', '216\.239\.(35|37|39|51)\.100','google_cache', '216\.239\.(35|37|39|51)\.101','google_cache', '216\.239\.5[0-9]\.104','google_cache', @@ -600,8 +629,24 @@ 'searchya\.com', 'searchya', 'picsearch\.de', 'picsearch', 'webssearches\.com', 'webssearches', +'airzip\.inspsearch\.com', 'inspsearch_com', 'zapmeta\.de', 'zapmeta', 'localmoxie\.com', 'localmoxie', +'search-results\.mobi', 'search-results_mobi', +'androidsearch\.com', 'androidsearch', +'isearch\.nation\.com', 'isearch_nation_com', +'search\.zonealarm\.com', 'search_zonealarm_com', +'www\.buenosearch\.com', 'www_buenosearch_com', +'search\.foxtab\.com', 'search_foxtab_com', +'searches\.qone8\.com', 'searches_qone8_com', +'startpage\.com', 'startpage_com', +'www\.qwant\.com', 'qwant_com', +'searches\.safehomepage\.com', 'safehomepage_com', +'searches\.vi-view\.com', 'vi-view_com', +'wow\.utop\.it', 'wow_utop_it', +'windowssearch\.com', 'windowssearch_com', +'www\.wow\.com', 'www_wow_com', +'searches\.globososo\.com', 'globososo_com', # Chello Portals 'chello\.at','chelloat', 'chello\.be','chellobe', @@ -694,6 +739,7 @@ 'search\.fbdownloader\.com','fbdownloader', 'search\.babylon\.com', 'babylon', 'my\.allgameshome\.com', 'allgameshome', +'surfcanyon\.com', 'surfcanyon_com', # Minor finnish search 
engines 'haku\.www\.fi','haku', # Minor french search engines @@ -741,6 +787,11 @@ 'umfis\.de', 'umfis', 'fastbot\.de', 'fastbot_de', 'tixuma\.de', 'tixuma_de', +'suche\.freenet\.de', 'freenet_de', +'www\.izito\.de', 'izito_de', +'extern\.peoplecheck\.de', 'peoplecheck_de', +'www\.oneseek\.de', 'oneseek_de', +'de\.wiki\.gov\.cn', 'de_wiki_gov_cn', # Minor Hungarian search engines 'heureka\.hu','heureka', 'vizsla\.origo\.hu','origo', @@ -773,6 +824,7 @@ # Minor Japanese search engines 'ask\.jp','askjp', 'sagool\.jp','sagool', +'websearch\.rakuten\.co\.jp', 'rakuten', # Minor Norwegian search engines 'sok\.start\.no','start', 'eniro\.no','eniro', # Minor Polish search engines @@ -806,6 +858,8 @@ # Minor swiss search engines 'search\.ch','searchch', 'search\.bluewin\.ch','bluewin', +'www\.zapmeta\.ch', 'zapmeta_ch', +'etools\.ch', 'etools_ch', # Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines 'pogodak\.','pogodak', # Generic search engines @@ -818,7 +872,8 @@ #------------------------------------------------------------------------------ %SearchEnginesWithKeysNotInQuery=( 'a9',1, # www.a9.com/searchkey1%20searchkey2 -'iminent',1 #http://start.iminent.com/StartWeb/1031/toolbox/#q=searchkey1%20searchkey2&additional_arguments +'iminent',1, #http://start.iminent.com/StartWeb/1031/toolbox/#q=searchkey1%20searchkey2&additional_arguments +'de_wiki_gov_cn',1 #http://de.wiki.gov.cn/s_searchkey1%20searchkey2 ); # SearchEnginesKnownUrl @@ -928,8 +983,24 @@ 'searchya', 'q=', 'picsearch', 'q=', 'webssearches', 'q=', +'inspsearch_com', 'q=', 'zapmeta', 'query=', 'localmoxie', 'keyword=', +'search-results_mobi', 'q=', +'androidsearch', 'q=', +'isearch_nation_com', 'q=', +'search_zonealarm_com', 'q=', +'www_buenosearch_com', 'q=', +'search_foxtab_com', 'q=', +'searches_qone8_com', 'q=', +'startpage_com', 'query=', +'qwant_com', 'q=', +'safehomepage_com', 'q=', +'vi-view_com', 'q=', +'wow_utop_it', 'q=', +'windowssearch_com', 'q=', +'www_wow_com', 'q=', 
+'globososo_com', 'q=', # Chello Portals 'chelloat','q1=', 'chellobe','q1=', @@ -997,6 +1068,7 @@ 'fbdownloader','q=', 'babylon','q=', 'allgameshome', 's=', +'surfcanyon_com', 'q=', # Minor finnish search engines 'haku','w=', # Minor french search engines @@ -1032,6 +1104,11 @@ 'umfis', 'suchbegriff=', 'fastbot_de', 'red=[0-9]*\+', 'tixuma_de', 'sc=', +'freenet_de', 'query=', +'izito_de', 'q=', +'peoplecheck_de', 'q=', +'oneseek_de', 'q=', +'de_wiki_gov_cn', 'de\.wiki\.gov\.cn\/s_', # Minor Hungarian search engines 'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=', 'keresolap_hu','q=', @@ -1057,6 +1134,7 @@ # Minor Japanese search engines 'askjp','(ask|q)=', 'sagool','q=', +'rakuten', 'qt=', # Minor Norwegian search engines 'start','q=', 'eniro','q=', # Minor Polish search engines @@ -1081,6 +1159,8 @@ 'enirose', 'hitta:', #Not sure if this works, as the keywords are part of the URL, and therefore the URL does not contain a question mark. 
# Minor swiss search engines 'searchch', 'q=', 'bluewin', 'qry=', +'zapmeta_ch', 'query=', +'etools_ch', 'query=', # Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines 'pogodak', 'q=' ); @@ -1217,8 +1297,24 @@ 'searchya', 'Searchya', 'picsearch', 'picsearch', 'webssearches', 'Web Searches', +'inspsearch_com', 'airzip.inspsearch.com (related to http://www.webssearches.com/?)', 'zapmeta', 'ZapMeta', 'localmoxie', 'Local Moxie', +'search-results_mobi', 'search-results.mobi', +'androidsearch', 'androidsearch.com', +'isearch_nation_com', 'Nation Search', +'search_zonealarm_com', 'Zone Alarm Search', +'www_buenosearch_com', 'BuenoSearch', +'search_foxtab_com', 'Foxtab Search', +'searches_qone8_com', 'Omiga-Plus', +'startpage_com', 'Startpage', +'qwant_com', 'qwant.com', +'safehomepage_com', 'safehomepage.com', +'vi-view_com', 'vi-view.com', +'wow_utop_it', 'wow.utop.it', +'windowssearch_com', 'windowssearch.com', +'www_wow_com', 'WOW.com', +'globososo_com', 'Globososo', # Chello Portals 'chelloat','Chello Austria', 'chellobe','Chello Belgium', @@ -1287,6 +1383,7 @@ 'fbdownloader','FBDownloader', 'babylon','Babylon', 'allgameshome', 'AllGamesHome', +'surfcanyon_com', 'SurfCanyon', # Minor finnish search engines 'haku','Ihmemaa', # Minor french search engines @@ -1297,8 +1394,10 @@ # Minor German search engines 'aolde','AOL (de)', 'o2aolde', 'o2 Suche', -'fireball','Fireball', 'infoseek','Infoseek', 'webde','Web.de', -'abacho','Abacho', 't-online','T-Online', +'fireball','Fireball', 'infoseek','Infoseek', +'webde','Web.de', +'abacho','Abacho', +'t-online','T-Online', 'allesklar','allesklar.de', 'meinestadt','meinestadt.de', 'metaspinner','metaspinner', 'metacrawler_de','metacrawler.de', @@ -1306,7 +1405,7 @@ 'netluchs','Netluchs', 'schoenerbrausen','Schoenerbrausen/', 'gmxsuche', 'GMX Suche', -'gmxsuche_at', 'GMX Suche �sterreich', +'gmxsuche_at', 'GMX Suche Oesterreich', 'ecosiasearch', 'Ecosia Search', 'aolsearch', 'AOL Search', 'aolsuche', 
'AOL Suche', @@ -1323,6 +1422,11 @@ 'umfis', 'UMFIS-Online Das Umweltfirmen-Informationssystem der IHKs in Deutschland', 'fastbot_de', 'Fastbot.de (Does not provide search keyphrases, using found page instead)', 'tixuma_de', 'Tixuma Deutschland', +'freenet_de', 'suche.freenet.de', +'izito_de', 'iZito Deutschland', +'peoplecheck_de', 'PeopleCheck.de', +'oneseek_de', 'Metasuchmaschine OneSeek.de', +'de_wiki_gov_cn', 'Wiki Sucher', # Minor hungarian search engines 'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkereso', 'goliat','Goliat', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Kereso', 'tango_hu','Tango', @@ -1348,6 +1452,7 @@ # Minor Japanese search engines 'askjp','Ask Japan', 'sagool','Sagool', +'rakuten', 'websearch.rakuten.co.jp', # Minor Norwegian search engines 'start','start.no', 'eniro','Eniro', # Minor polish search engines @@ -1376,6 +1481,8 @@ 'sapo','Sapo', # Minor Swiss search engines 'searchch', 'search.ch', 'bluewin', 'search.bluewin.ch', +'zapmeta_ch', 'ZapMeta.ch', +'etools_ch', 'eTools.ch', # Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines 'pogodak','Pogodak.com', # Generic search engines -- 2.47.2