From: Laurent Destailleur Date: Wed, 11 Nov 2015 19:27:51 +0000 (+0100) Subject: Update bot and search engine files X-Git-Tag: AWSTATS_7_5~28 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ef19886c5592ba0bcc39f27e8f3e8b273334d64f;p=thirdparty%2FAWStats.git Update bot and search engine files --- diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 13566479..d87f7885 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -37,6 +37,7 @@ # in the user agent string only once or just a few times. Most of the # time a user agent string ist used that does not contain hints that # a bot is involved. An example is the iCjobs spider. +# msnbot-UDiscovery/2.0b seems to show this behaviour too. # # # @@ -713,6 +714,8 @@ 'xenu\slink\ssleuth', 'xget', # Other robots reported by users +'^finbot', #UA string starts with "finbot", should not match "elfinbot" +'^webindex$', #UA should not match "webindexer" '1\-more_scanner', '360spider', 'a6-indexer', @@ -728,6 +731,7 @@ 'alpha_search_agent', 'allrati', 'aport', +'applebot', 'archive\-de\.com', 'archive\.org_bot', 'argus', # Must be before nutch @@ -783,12 +787,14 @@ 'daviesbot', 'daypopbot', 'deepindex', +'deusu', 'dipsie\.bot', 'dnsgroup', 'doccheckbot', 'domainchecker', 'domainsdb\.net', 'dotbot', +'duckduckgo-favicons-bot', 'dulance', 'dumbot', 'dumm\.de\-bot', @@ -834,6 +840,7 @@ 'g2crawler', 'gaisbot', 'geniebot', +'genieo', 'gigablastopensource', 'gigabot', 'girafabot', @@ -859,6 +866,7 @@ 'ichiro', 'idmarch', 'iltrovatore\-setaccio', +'implisensebot', 'infobot', 'infociousbot', 'infohelfer', @@ -876,6 +884,7 @@ 'iupui_research_bot', 'izsearch', 'james\sbot', +'jobboerse', #AWStats seems not to find this one despite the fact that "JobboerseBot" and "jobboerse.com" appear in the UA-string, maybe some previous entry matches 'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', 'justview', 'kalambot', @@ -915,14 +924,17 @@ 'meanpathbot', 
'mediabot', 'mediapartners\-google', +'megaindex', 'megite', 'memorybot', 'metager2-verification-bot', +'metajobbot', #Does not show up in the results of Sep. 2015 despite the fact that the corresponding log file has about 40 entries containing "MetaJobBot" in the UA string - strange. 'metaspinner', 'miadev', -'microsoft bits', +'microsoft\sbits', 'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery', 'microsoft[_+\s]url[_+\s]control', +'mindupbot', 'mini\-reptile', 'minirank', 'missigua_locator', @@ -940,7 +952,7 @@ 'mydoyouhike', 'nagios', 'nasa_search', -'netestate ne crawler', +'netestate\sne\scrawler', 'netluchs', 'netsprint', 'newsgatoronline', @@ -959,11 +971,13 @@ 'onfolio', 'opentaggerbot', 'openwebspider', +'optimizer', 'oracle_ultra_search', 'orangebot', 'orbiter', 'yodaobot', 'qihoobot', +'qwantify', 'passwordmaker\.org', 'pear_http_request_class', 'peerbot', @@ -986,11 +1000,13 @@ 'pyquery', 'rambler', 'redalert', +'riddler', 'rogerbot', 'rojo', 'rssimagesbot', 'ruffle', 'rufusbot', +'safesearch', 'sandcrawler', 'savetheworldheritage', 'sbider', @@ -1003,11 +1019,13 @@ 'sensis_web_crawler', 'seodiver', 'seokicks\.de', +'seoscanners', 'seznambot', 'shim\-crawler', 'shoutcast', 'sitedomain-bot', 'siteexplorer\.info', +'skimbot', 'slysearch', 'smtbot', 'snap\.com_beta_crawler', @@ -1035,6 +1053,7 @@ 'teragramcrawlersurf', 'test_crawler', 'testbot', +'thumbsniper', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', 'topicblogs', 'turnitinbot', @@ -1051,9 +1070,11 @@ 'ustc\-semantic\-group', 'vagabondo\-wap', 'vagabondo', +'vebidoobot', 'vermut', 'versus_crawler_from_eda\.baykan@epfl\.ch', 'vespa_crawler', +'voltron', 'vortex', 'vse\/', 'w3c\-checklink', @@ -1062,6 +1083,7 @@ 'watchmouse', 'wavefire', 'waybackarchive\.org', +'wbsearchbot', 'webclipping\.com', 'webcompass', 'webcrawl\.net', @@ -1073,6 +1095,7 @@ 'website[_+\s]monitoring[_+\s]bot', 'webvulncrawl', 'wells_search', 
+'wer-liefert-was', 'wesee:search', 'wevikabot', 'wonderer', @@ -1096,6 +1119,7 @@ 'flexum', 'yanga', 'yet-another-spider', +'yisouspider', 'yooglifetchagent', 'z\-add_link_checker', 'zealbot', @@ -1135,10 +1159,10 @@ 'geohasher', 'hanrss', 'inagist', -'jacobin club', +'jacobin\sclub', 'jakarta', 'js\-kit', -'largesmall crawler', +'largesmall\scrawler', 'linkedinbot', 'longurl', 'metauri', @@ -1156,7 +1180,7 @@ '^msie', # End of hiding bots. 'netnewswire', -' netseer ', +'\snetseer\s', 'netvibes', 'newrelicpinger', 'newsfox', @@ -1173,7 +1197,7 @@ 'r6\_', 'ratingburner', 'regator', -'rome client', +'rome\sclient', 'rpt\-httpclient', 'rssgraffiti', 'sage\+\+', @@ -1189,7 +1213,7 @@ 'trapit', 'trileet', 'tweetedtimes', -'twisted pagegetter', +'twisted\spagegetter', 'twitterbot', 'twitterfeed', 'unwindfetchor', @@ -1197,12 +1221,12 @@ 'windows\-rss\-platform', 'wiumi', 'xydo', -'yahoo! slurp', -'yahoo pipes', +'yahoo!\sslurp', +'yahoo\spipes', 'yahoo\-newscrawler', 'yahoocachesystem', 'yahooexternalcache', -'yahoo! searchmonkey', +'yahoo!\ssearchmonkey', 'yahooysmcm', 'yammer', # 'yandexbot', #already covered by 'yandex' @@ -1284,7 +1308,7 @@ 'jennybot','JennyBot', 'mercator','Mercator', 'msnbot\-media','MSNBot-media', -'msnbot-udiscovery', 'msnbot-UDiscovery Note: Most traffic counts as user traffic', +'msnbot-udiscovery', 'msnbot-UDiscovery Note: AWStats counts most of its traffic as user traffic', 'msnbot','MSNBot', 'netcraft','Netcraft', 'petersnews','Petersnews', @@ -1591,6 +1615,8 @@ 'xenu\slink\ssleuth', 'Xenu'. "'" . 
's Link Sleuth (TM), see Wikipedia', 'xget','XGET', # Other robots reported by users +'^finbot', 'finbot', +'^webindex$', 'WebIndex', '1\-more_scanner','1-More Scanner', '360spider','360spider', 'a6-indexer', 'A6-Indexer', @@ -1606,6 +1632,7 @@ 'alpha_search_agent','Alpha Search Agent', 'allrati','Allrati', 'aport', 'Aport', +'applebot', 'Applebot', 'archive\-de\.com', 'Archive-de.com', 'archive\.org_bot','archive.org bot', 'argus','Argus', @@ -1660,12 +1687,14 @@ 'daviesbot', 'DaviesBot', 'daypopbot', 'DayPop', 'deepindex','Deepindex', +'deusu', 'DeuSu', 'dipsie\.bot','Dipsie', 'dnsgroup','DNSGroup', 'doccheckbot', 'doccheckbot/1.0, known to Project Honey Pot', 'domainchecker','DomainChecker', 'domainsdb\.net','DomainsDB.net', 'dotbot', 'DotBot, Open Site Explorer', +'duckduckgo-favicons-bot', 'DuckDuckGo-Favicons-Bot', 'dulance','Dulance', 'dumbot','Dumbot', 'dumm\.de\-bot','dumm.de-Bot', @@ -1711,6 +1740,7 @@ 'g2crawler','G2Crawler', 'gaisbot','Gaisbot', 'geniebot','Geniebot', +'genieo', 'Genieo', 'gigablastopensource', 'GigablastOpenSource, an Open Source Search Engine(Wiki)', 'gigabot','GigaBot', 'girafabot','Girafabot', @@ -1736,6 +1766,7 @@ 'ichiro','ichiro', 'idmarch', 'IDMARCH', 'iltrovatore\-setaccio','IlTrovatore-Setaccio', +'implisensebot', 'ImplisenseBot', 'infobot','InfoBot', 'infociousbot','InfociousBot', 'infohelfer','Infohelfer', @@ -1752,6 +1783,7 @@ 'iupui_research_bot','IUPUI_Research_Bot', 'izsearch', 'iZSearch', 'james\sbot', 'James BOT', +'jobboerse', 'Jobbörse', 'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','JRTwine_Software_Check_Favorites_Utility', 'justview', 'JustView', 'kalambot','KalamBot', @@ -1789,16 +1821,19 @@ 'mediapartners\-google','Google AdSense', # 'Mediapartners-Google (Feb 12, 2015: no additial information in UA String, seems to use GigablastOpenSource', # Uses UA string "Mediapartners-Google" only, and there were accesses using an UA string "GigablastOpenSource/1.0" from the same IP-Address. 
-# Therefore this is probably not related to Google 4.3.2015 Albrecht M�ller +# Therefore this is probably not related to Google 4.3.2015 Albrecht Müller +'megaindex', 'MegaIndex Crawler, seems to belong to MegaIndex.ru', 'megite','Megite', 'memorybot', 'Archivethe.net', 'metager2-verification-bot', 'metager2-verification-bot', 'metager\-linkchecker','MetaGer LinkChecker', +'metajobbot', 'MetaJobBot', 'metaspinner','Metaspinner', 'miadev', 'MiaDev spider', -'microsoft bits', 'Microsoft Background Intelligent Transfer Service (BITS)?', +'microsoft\sbits', 'Microsoft Background Intelligent Transfer Service (BITS)?', 'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', 'microsoft[_+\s]url[_+\s]control','Microsoft URL Control', +'mindupbot', 'mindUpBot (datenbutler.de)', 'minirank','miniRank', 'mini\-reptile','Mini-reptile', 'missigua_locator','Missigua_Locator', @@ -1816,7 +1851,7 @@ 'mydoyouhike','Mydoyouhike', 'nagios','Nagios', 'nasa_search','NASA Search', -'netestate ne crawler','Website-Datenbank', +'netestate\sne\scrawler','Website-Datenbank', 'netluchs','Netluchs', 'netsprint','NetSprint', 'newsgatoronline', 'NewsGator Online', @@ -1835,11 +1870,13 @@ 'onfolio','Onfolio', 'opentaggerbot','OpenTaggerBot', 'openwebspider','OpenWebSpider', +'optimizer', 'Optimizer', 'oracle_ultra_search','Oracle Ultra Search', 'orangebot', 'OrangeBot, no website, log entry specifies mail address', # support.orangebot@orange.com 'orbiter','Orbiter', 'yodaobot','OutfoxBot/YodaoBot', 'qihoobot','QihooBot', +'qwantify', 'Qwant', 'passwordmaker\.org','passwordmaker.org', 'pear_http_request_class','PEAR HTTP Request class', 'peerbot','PEERbot', @@ -1871,15 +1908,18 @@ #146.0.32.165's User Agent Strings #Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) #Mozilla/5.0 (compatible; savetheworldheritage.org/1.0; +crawl@savetheworldheritage.org) +#Mozilla/5.0 (compatible; seoscanners.net/1; 
+spider@seoscanners.net) 'pyquery','PyQuery', 'rambler','StackRambler', 'redalert','Red Alert', 'relevantnoise\.com', 'Relevant Noise', +'riddler', 'Riddler', 'rogerbot', 'Rogerbot', 'rojo','RoJo aggregator', 'rssimagesbot','rssImagesBot', 'ruffle','ruffle SemanticWeb crawler', 'rufusbot','RufusBot Rufus Web Miner', +'safesearch', 'Avira SafeSearch', 'sandcrawler','SandCrawler (Microsoft)', 'savetheworldheritage', 'savetheworldheritage.org (related to spiderlytics.com, waybackarchive.org and/or publiclibraryarchive.org?)', 'sbider','SBIder', @@ -1892,11 +1932,13 @@ 'sensis_web_crawler','Sensis Web Crawler', 'seodiver', 'SEO DIVER', 'seokicks\.de', 'SEOkicks Webcrawler', +'seoscanners', 'seoscanners.net (related to publiclibraryarchive.org and savetheworldheritage.org?)', 'seznambot','SeznamBot', 'shim\-crawler','Shim-Crawler', 'shoutcast','Shoutcast Directory Service', 'sitedomain-bot', 'Sitedomain.de', 'siteexplorer\.info', 'Site Explorer', +'skimbot', 'SkimBot', 'slysearch','SlySearch', 'smtbot', 'SMTBot', 'snap\.com_beta_crawler','snap.com beta crawler', @@ -1923,6 +1965,7 @@ 'teragramcrawlersurf','TeragramCrawlerSURF', 'test_crawler','Test Crawler', 'testbot','TestBot', +'thumbsniper', 'ThumbSniper', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','T-H-U-N-D-E-R-S-T-O-N-E', 'topicblogs', 'topicblogs', 'turnitinbot', 'Turn It In', @@ -1939,9 +1982,11 @@ 'ustc\-semantic\-group','USTC-Semantic-Group', 'vagabondo\-wap','Vagabondo-WAP', 'vagabondo','Vagabondo', +'vebidoobot', 'vebidoobot', 'vermut','Vermut', 'versus_crawler_from_eda\.baykan@epfl\.ch','versus crawler from eda.baykan@epfl.ch', 'vespa_crawler','Vespa Crawler', +'voltron', 'voltron', 'vortex','VORTEX', 'vse\/','VSE', 'w3c\-checklink','W3C Link Checker', @@ -1952,6 +1997,7 @@ 'waybackarchive\.org', 'No website, email: spider(at)waybackarchive.org', # 2.12.2013 Project Honeypot reports at least one of the IPs used by waybackarchive with a spiderlytics UA string. 
# Problably not related to the wayback machine of archive.org. +'wbsearchbot', 'WBSearchBot', 'webclipping\.com', 'WebClipping.com', 'webcompass', 'webcompass', 'webcrawl\.net','webcrawl.net', @@ -1963,6 +2009,7 @@ 'website[_+\s]monitoring[_+\s]bot','Website_Monitoring_Bot', 'webvulncrawl', 'WebVulnCrawl', 'wells_search','Wells Search', +'wer-liefert-was', 'Wer-liefert-was Crawler Note: AWStats counts most traffic as user traffic', 'wesee:search', 'WeSEE Bot', 'wevikabot', 'WeViKa', 'wonderer', 'Web Wombat Redback Spider', @@ -1986,6 +2033,7 @@ 'flexum', 'Flexum Search Engine', 'yanga', 'Yanga WorldSearch Bot', 'yet-another-spider','Yet-Another-Spider', +'yisouspider', 'YisouSpider (no additional information in UA string)', 'yooglifetchagent','yoogliFetchAgent', 'z\-add_link_checker','Z-Add Link Checker', 'zealbot','ZealBot', @@ -2011,7 +2059,7 @@ 'sucker', 'Unknown robot (identified by \'sucker\')', 'bot[\s_+:,\.\;\/\\\-]', 'Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', '[\s_+:,\.\;\/\\\-]bot', 'Unknown robot (identified by a space or one of the characters _+:,.;/\- followed by \'bot\')', -'curl', 'Common *nix tool for automating web document retireval. Most likely a bot.', +'curl', 'Common *nix tool for automating web document retrieval. Most likely a bot.', 'php', 'A PHP script', 'ruby\/', 'Ruby script', # Additional bots found by Sussex. 
@@ -2044,10 +2092,10 @@ 'geohasher', 'geohasher', 'hanrss', 'hanrss', 'inagist', 'inagist', -'jacobin club', 'jacobin club', +'jacobin\sclub', 'jacobin club', 'jakarta', 'jakarta', 'js\-kit', 'js-kit', -'largesmall crawler', 'largesmall crawler', +'largesmall\scrawler', 'largesmall crawler', 'linkedinbot', 'linkedinbot', 'longurl', 'longurl', 'metauri', 'metauri', @@ -2063,7 +2111,7 @@ '^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', '^msie', 'Suspected bot masquerading as M$ IE', 'netnewswire', 'netnewswire', -' netseer ', 'Net Seer', +'\snetseer\s', 'Net Seer', 'netvibes', 'netvibes', 'newrelicpinger', 'newrelicpinger', 'newsfox', 'Fox News', @@ -2080,7 +2128,7 @@ 'r6\_', 'Radian 6 Crawler', 'ratingburner', 'ratingburner', 'regator', 'regator', -'rome client', 'rome client', +'rome\sclient', 'rome client', 'rpt\-httpclient', 'rpt-httpclient', 'rssgraffiti', 'rssgraffiti', 'sage\+\+', 'sage++', @@ -2096,7 +2144,7 @@ 'trapit', 'trapit', 'trileet', 'trileet', 'tweetedtimes', 'The Tweeted Times', -'twisted pagegetter', 'twisted pagegetter', +'twisted\spagegetter', 'twisted pagegetter', 'twitterbot', 'twitterbot', 'twitterfeed', 'twitterfeed', 'unwindfetchor', 'unwindfetchor', @@ -2104,12 +2152,12 @@ 'windows\-rss\-platform', 'windows-rss-platform', 'wiumi', 'wiumi', 'xydo', 'xydo', -'yahoo! slurp', 'Additional Yahoo bots.', -'yahoo pipes', 'Additional Yahoo bots.', +'yahoo!\sslurp', 'Additional Yahoo bots.', +'yahoo\spipes', 'Additional Yahoo bots.', 'yahoo\-newscrawler', 'Additional Yahoo bots.', 'yahoocachesystem', 'Additional Yahoo bots.', 'yahooexternalcache', 'Additional Yahoo bots.', -'yahoo! searchmonkey', 'Additional Yahoo bots.', +'yahoo!\ssearchmonkey', 'Additional Yahoo bots.', 'yahooysmcm', 'Additional Yahoo bots.', 'yammer', 'yammer', #'yandexbot', 'yandexbot', #already covered by 'yandex' @@ -2153,12 +2201,12 @@ 'bingbot'=>'MSN', 'twitterbot'=>'Twitter', 'twitterfeed'=>'Twitter', -'yahoo! 
slurp'=>'Yahoo', -'yahoo pipes'=>'Yahoo', +'yahoo!\sslurp'=>'Yahoo', +'yahoo\spipes'=>'Yahoo', 'yahoo-newscrawler'=>'Yahoo', 'yahoocachesystem'=>'Yahoo', 'yahooexternalcache'=>'Yahoo', -'yahoo! searchmonkey'=>'Yahoo', +'yahoo!\ssearchmonkey'=>'Yahoo', 'yahooysmcm'=>'Yahoo' ); diff --git a/wwwroot/cgi-bin/lib/search_engines.pm b/wwwroot/cgi-bin/lib/search_engines.pm index a174589d..9899ff0c 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pm +++ b/wwwroot/cgi-bin/lib/search_engines.pm @@ -344,6 +344,9 @@ 'windowssearch\.com', 'www\.wow\.com', 'searches\.globososo\.com', +'swisscows\.ch', +'globososo\.com', +'preciobarato\.xyz', # Chello Portals 'chello\.at', 'chello\.be', @@ -408,9 +411,12 @@ # Minor english search engines '(^|\.)ask\.co\.uk','bbc\.co\.uk/cgi-bin/search','ifind\.freeserve','looksmart\.co\.uk','splut\.','spotjockey\.','ukdirectory\.','ukindex\.co\.uk','ukplus\.','searchy\.co\.uk', 'search\.fbdownloader\.com', +'search\.fdownloadr\.com', 'search\.babylon\.com', 'my\.allgameshome\.com', 'surfcanyon\.com', +'uk\.foxstart\.com', +'yandex\.com', # Minor finnish search engines 'haku\.www\.fi', # Minor french search engines @@ -450,6 +456,9 @@ 'extern\.peoplecheck\.de', 'www\.oneseek\.de', 'de\.wiki\.gov\.cn', +'umuwa\.de', +'suche\.1und1\.de', +'www\.metasuche\.ch', # Minor Hungarian search engines 'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu', 'tango\.hu', @@ -648,6 +657,9 @@ 'windowssearch\.com', 'windowssearch_com', 'www\.wow\.com', 'www_wow_com', 'searches\.globososo\.com', 'globososo_com', +'swisscows\.ch', 'swisscows_ch', +'globososo\.com', 'globososo_com', +'preciobarato\.xyz', 'preciobarato_xyz', # Chello Portals 'chello\.at','chelloat', 'chello\.be','chellobe', @@ -738,9 +750,12 @@ 'ukplus\.','ukplus', 'searchy\.co\.uk','searchy', 'search\.fbdownloader\.com','fbdownloader', +'search\.fdownloadr\.com', 'fdownloadr_com', 'search\.babylon\.com', 'babylon', 
'my\.allgameshome\.com', 'allgameshome', 'surfcanyon\.com', 'surfcanyon_com', +'uk\.foxstart\.com', 'uk_foxstart_com', +'yandex\.com', 'yandex_com', # Minor finnish search engines 'haku\.www\.fi','haku', # Minor french search engines @@ -793,6 +808,9 @@ 'extern\.peoplecheck\.de', 'peoplecheck_de', 'www\.oneseek\.de', 'oneseek_de', 'de\.wiki\.gov\.cn', 'de_wiki_gov_cn', +'umuwa\.de', 'umuwa_de', +'suche\.1und1\.de', '1und1_de', +'www\.metasuche\.ch', 'metasuche_ch', # Minor Hungarian search engines 'heureka\.hu','heureka', 'vizsla\.origo\.hu','origo', @@ -874,7 +892,9 @@ %SearchEnginesWithKeysNotInQuery=( 'a9',1, # www.a9.com/searchkey1%20searchkey2 'iminent',1, #http://start.iminent.com/StartWeb/1031/toolbox/#q=searchkey1%20searchkey2&additional_arguments -'de_wiki_gov_cn',1 #http://de.wiki.gov.cn/s_searchkey1%20searchkey2 +'de_wiki_gov_cn',1, #http://de.wiki.gov.cn/s_searchkey1%20searchkey2 +'umuwa_de', 1, #http://umuwa.de/searchkey or http://umuwa.de/searchkey/Images +'amazonsearch', 1 #http://www.amazon.de/gp/bit/apps/web/SERP/search/ref=bit_bds-p24_serp_cr_de?ie=UTF8tagbase=bds-p24&query=deutsch+8.+klasse+gymnasium+protokoll ); # SearchEnginesKnownUrl @@ -1002,6 +1022,9 @@ 'windowssearch_com', 'q=', 'www_wow_com', 'q=', 'globososo_com', 'q=', +'swisscows_ch', 'query=', +'globososo_com', 'q=', +'preciobarato_xyz', 's=', # Chello Portals 'chelloat','q1=', 'chellobe','q1=', @@ -1067,9 +1090,12 @@ 'askuk','(ask|q)=', 'bbc','q=', 'freeserve','q=', 'looksmartuk','key=', 'splut','pattern=', 'spotjockey','Search_Keyword=', 'ukindex', 'stext=', 'ukdirectory','k=', 'ukplus','search=', 'searchy', 'search_term=', 'fbdownloader','q=', +'fdownloadr_com', 'q=', 'babylon','q=', 'allgameshome', 's=', 'surfcanyon_com', 'q=', +'uk_foxstart_com', 'q=', +'yandex_com', 'text=', # Minor finnish search engines 'haku','w=', # Minor french search engines @@ -1110,6 +1136,9 @@ 'peoplecheck_de', 'q=', 'oneseek_de', 'q=', 'de_wiki_gov_cn', 'de\.wiki\.gov\.cn\/s_', +'umuwa_de', 
'umuwa\.de\/', +'1und1_de', 'q=', +'metasuche_ch', 'q=', # Minor Hungarian search engines 'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=', 'keresolap_hu','q=', @@ -1196,43 +1225,43 @@ #------------------------------------------------------------------------------ %SearchEnginesHashLib=( # Major international search engines -'alexa','Alexa', -'alltheweb','AllTheWeb', -'altavista','AltaVista', -'a9', 'A9', -'dmoz','DMOZ', -'google_products','Google (Products)', -'google_base','Google (Base)', -'google_froogle','Froogle (Google)', -'google_groups','Google (Groups)', -'google_image','Google (Images)', -'google_cache','Google (cache)', -'google','Google', -'lycos','Lycos', -'msn','Microsoft MSN Search', -'live','Microsoft Windows Live', -'bing','Microsoft Bing', -'netscape','Netscape', -'aol','AOL', -'terra','Terra', -'tiscali','Tiscali', -'voila','Voila', -'search.com','Search.com', -'yahoo_mindset','Yahoo! Mindset', -'yahoo','Yahoo!', -'sympatico','Sympatico', -'excite','Excite', +'alexa','Alexa', +'alltheweb','AllTheWeb', +'altavista','AltaVista', +'a9', 'A9', +'dmoz','DMOZ', +'google_products','Google (Products)', +'google_base','Google (Base)', +'google_froogle','Froogle (Google)', +'google_groups','Google (Groups)', +'google_image','Google (Images)', +'google_cache','Google (cache)', +'google','Google', +'lycos','Lycos', +'msn','Microsoft MSN Search', +'live','Microsoft Windows Live', +'bing','Microsoft Bing', +'netscape','Netscape', +'aol','AOL', +'terra','Terra', +'tiscali','Tiscali', +'voila','Voila', +'search.com','Search.com', +'yahoo_mindset','Yahoo! 
Mindset', +'yahoo','Yahoo!', +'sympatico','Sympatico', +'excite','Excite', # Minor international search engines -'google4counter','4-counter (Google)', -'att','AT&T search (powered by Google)', -'bungeebonesdotcom','BungeeBones', +'google4counter','4-counter (Google)', +'att','AT&T search (powered by Google)', +'bungeebonesdotcom','BungeeBones', 'go','Go.com', -'askde','Ask Deutschland', -'askes','Ask España', # break out Ask country specific engines. -'askfr','Ask France', -'askit','Ask Italia', -'asknl','Ask Nederland', -'ask','Ask', +'askde','Ask Deutschland', +'askes','Ask España', # break out Ask country specific engines. +'askfr','Ask France', +'askit','Ask Italia', +'asknl','Ask Nederland', +'ask','Ask', 'atomz','Atomz', 'dejanews','DejaNews', 'euroseek','Euroseek', @@ -1293,29 +1322,32 @@ 'sweetpacks', 'Sweetpacks', 'searchgol', 'Search-Gol', 'duckduckgo', 'DuckDuckGo (Does not provide search keyphrases, using found page instead)', -'facemoods', 'Facemoods Search', -'shoppstop', 'ShoppStop', -'searchya', 'Searchya', -'picsearch', 'picsearch', -'webssearches', 'Web Searches', -'inspsearch_com', 'airzip.inspsearch.com (related to http://www.webssearches.com/?)', -'zapmeta', 'ZapMeta', -'localmoxie', 'Local Moxie', -'search-results_mobi', 'search-results.mobi', -'androidsearch', 'androidsearch.com', -'isearch_nation_com', 'Nation Search', -'search_zonealarm_com', 'Zone Alarm Search', -'www_buenosearch_com', 'BuenoSearch', -'search_foxtab_com', 'Foxtab Search', -'searches_qone8_com', 'Omiga-Plus', -'startpage_com', 'Startpage', -'qwant_com', 'qwant.com', -'safehomepage_com', 'safehomepage.com', -'vi-view_com', 'vi-view.com', -'wow_utop_it', 'wow.utop.it', -'windowssearch_com', 'windowssearch.com', -'www_wow_com', 'WOW.com', -'globososo_com', 'Globososo', +'facemoods', 'Facemoods Search', +'shoppstop', 'ShoppStop', +'searchya', 'Searchya', +'picsearch', 'picsearch', +'webssearches', 'Web Searches', +'inspsearch_com', 'airzip.inspsearch.com (related to 
http://www.webssearches.com/?)', +'zapmeta', 'ZapMeta', +'localmoxie', 'Local Moxie', +'search-results_mobi', 'search-results.mobi', +'androidsearch', 'androidsearch.com', +'isearch_nation_com', 'Nation Search', +'search_zonealarm_com', 'Zone Alarm Search', +'www_buenosearch_com', 'BuenoSearch', +'search_foxtab_com', 'Foxtab Search', +'searches_qone8_com', 'Omiga-Plus', +'startpage_com', 'Startpage', +'qwant_com', 'qwant.com', +'safehomepage_com', 'safehomepage.com', +'vi-view_com', 'vi-view.com', +'wow_utop_it', 'wow.utop.it', +'windowssearch_com', 'windowssearch.com', +'www_wow_com', 'WOW.com', +'globososo_com', 'Globososo', +'swisscows_ch', 'Swisscows', +'globososo_com', 'Globososo', +'preciobarato_xyz', 'Yandex', # Chello Portals 'chelloat','Chello Austria', 'chellobe','Chello Belgium', @@ -1381,10 +1413,13 @@ 'askuk','Ask UK', 'bbc','BBC', 'freeserve','Freeserve', 'looksmartuk','Looksmart UK', 'splut','Splut', 'spotjockey','Spotjockey', 'ukdirectory','UK Directory', 'ukindex','UKIndex', 'ukplus','UK Plus', 'searchy','searchy.co.uk', -'fbdownloader','FBDownloader', +'fbdownloader','FBDownloader (fbdownloader)', +'fdownloadr_com', 'FBDownloader (fdownloadr)', 'babylon','Babylon', 'allgameshome', 'AllGamesHome', -'surfcanyon_com', 'SurfCanyon', +'surfcanyon_com', 'SurfCanyon', +'uk_foxstart_com', 'Foxstart.com', +'yandex_com', 'Yandex', # Minor finnish search engines 'haku','Ihmemaa', # Minor french search engines @@ -1416,18 +1451,21 @@ 'wowsearch', 'Wow Search', 'vlips_de', 'vlips.de', 'metager', 'MetaGer', -'search_1und1_de', '1&1 Suche', +'search_1und1_de', '1&1 Suche (subdomain "search")', 'smde', 'SM.de - Die SuchMaschine', 'sumaja', 'Sumaja', 'navigationshilfe', 'T-Online Navigationshilfe', -'umfis', 'UMFIS-Online Das Umweltfirmen-Informationssystem der IHKs in Deutschland', -'fastbot_de', 'Fastbot.de (Does not provide search keyphrases, using found page instead)', -'tixuma_de', 'Tixuma Deutschland', -'freenet_de', 'suche.freenet.de', -'izito_de', 'iZito 
Deutschland', -'peoplecheck_de', 'PeopleCheck.de', -'oneseek_de', 'Metasuchmaschine OneSeek.de', -'de_wiki_gov_cn', 'Wiki Sucher', +'umfis', 'UMFIS-Online Das Umweltfirmen-Informationssystem der IHKs in Deutschland', +'fastbot_de', 'Fastbot.de (Does not provide search keyphrases, using found page instead)', +'tixuma_de', 'Tixuma Deutschland', +'freenet_de', 'suche.freenet.de', +'izito_de', 'iZito Deutschland', +'peoplecheck_de', 'PeopleCheck.de', +'oneseek_de', 'Metasuchmaschine OneSeek.de', +'de_wiki_gov_cn', 'Wiki Sucher', +'umuwa_de', 'Umuwa Deutschland', +'1und1_de', '1&1 Suche (subdomain "suche")', +'metasuche_ch', 'Metasuche.ch', # Minor hungarian search engines 'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkereso', 'goliat','Goliat', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Kereso', 'tango_hu','Tango', @@ -1453,7 +1491,7 @@ # Minor Japanese search engines 'askjp','Ask Japan', 'sagool','Sagool', -'rakuten', 'websearch.rakuten.co.jp', +'rakuten', 'websearch.rakuten.co.jp', # Minor Norwegian search engines 'start','start.no', 'eniro','Eniro', # Minor polish search engines @@ -1482,8 +1520,8 @@ 'sapo','Sapo', # Minor Swiss search engines 'searchch', 'search.ch', 'bluewin', 'search.bluewin.ch', -'zapmeta_ch', 'ZapMeta.ch', -'etools_ch', 'eTools.ch', +'zapmeta_ch', 'ZapMeta.ch', +'etools_ch', 'eTools.ch', # Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines 'pogodak','Pogodak.com', # Generic search engines @@ -1522,4 +1560,4 @@ #} #print @SearchEnginesSearchIDOrder_list1." ".@SearchEnginesSearchIDOrder_list2." ".@SearchEnginesSearchIDOrder_listgen; -1; \ No newline at end of file +1;