From: Laurent Destailleur Date: Sat, 28 Dec 2013 12:11:33 +0000 (+0100) Subject: Update database X-Git-Tag: AWSTATS_7_3~29 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=50e2cf75fc672aaa09daf1e2c9dec573bdaa51d9;p=thirdparty%2FAWStats.git Update database --- diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index 877204b3..5459bf2e 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -685,6 +685,7 @@ # Other robots reported by users '1\-more_scanner', '360spider', +'a6-indexer', 'accoona\-ai\-agent', 'activebookmark', 'adamm_bot', @@ -724,6 +725,7 @@ 'bubing', 'bumblebee', 'candlelight[_+ ]favorites[_+ ]inspector', +'careerbot', 'cbn00glebot', 'cerberian_drtrs', 'cfnetwork', @@ -733,6 +735,7 @@ 'computer_and_automation_research_institute_crawler', 'converamultimediacrawler', 'converacrawler', +'copubbot', 'cscrawler', 'cse_html_validator_lite_online', 'cuasarbot', @@ -807,6 +810,7 @@ 'html[_+ ]link[_+ ]validator', 'httrack', 'hundesuche\.com\-bot', +'i-bot', 'ichiro', 'iltrovatore\-setaccio', 'infobot', @@ -858,6 +862,8 @@ 'mediapartners\-google', 'megite', 'metaspinner', +'miadev', +'microsoft bits', 'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery', 'microsoft[_+ ]url[_+ ]control', 'mini\-reptile', @@ -926,8 +932,11 @@ 'schizozilla', 'scumbot', 'searchguild[_+ ]dmoz[_+ ]experiment', +'searchmetricsbot', 'seekbot', +'semrushbot', 'sensis_web_crawler', +'seokicks\.de', 'seznambot', 'shim\-crawler', 'shoutcast', @@ -937,6 +946,7 @@ 'sohu\-search', 'sohu', # "sohu agent" 'snappy', +'spbot', 'sphere_scout', 'spiderlytics', 'spip', @@ -982,6 +992,7 @@ 'w3c_validator', 'watchmouse', 'wavefire', +'waybackarchive\.org', 'webclipping\.com', 'webcompass', 'webcrawl\.net', @@ -1496,6 +1507,7 @@ # Other robots reported by users '1\-more_scanner','1-More Scanner', '360spider','360spider', +'a6-indexer', 'A6-Indexer', 'accoona\-ai\-agent','Accoona-AI-Agent', 'activebookmark','ActiveBookmark', 'adamm_bot','AdamM Bot', @@ -1535,6 +1547,7 @@ 'bubing', 'BUbiNG', 'bumblebee', 'Bumblebee (relevare.com)', 'candlelight[_+ ]favorites[_+ ]inspector','Candlelight_Favorites_Inspector', +'careerbot', 'CareerBot', 'cbn00glebot','cbn00glebot', 'cerberian_drtrs','Cerberian Drtrs', 'cfnetwork','CFNetwork', @@ -1544,6 +1557,7 @@ 'computer_and_automation_research_institute_crawler','Computer and Automation Research Institute Crawler', 'converamultimediacrawler','ConveraMultiMediaCrawler', 'converacrawler','ConveraCrawler', +'copubbot', 'CoPubbot', 'cscrawler','CsCrawler', 'cse_html_validator_lite_online','CSE HTML Validator Lite Online','cuasarbot','Cuasarbot', 'cursor','Cursor', @@ -1617,6 +1631,7 @@ 'html[_+ ]link[_+ ]validator','Html_Link_Validator', 'httrack','HTTrack off-line browser', 'hundesuche\.com\-bot','Hundesuche.com-Bot', +'i-bot','i-bot', 'ichiro','ichiro', 'iltrovatore\-setaccio','IlTrovatore-Setaccio', 'infobot','InfoBot', @@ -1664,6 +1679,8 @@ 'megite','Megite', 'metager\-linkchecker','MetaGer LinkChecker', 'metaspinner','Metaspinner', +'miadev', 'MiaDev spider', +'microsoft bits', 'Microsoft Background Intelligent Transfer Service (BITS)?', 'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', 'microsoft[_+ ]url[_+ ]control','Microsoft URL Control', 'minirank','miniRank', @@ -1733,8 +1750,11 @@ 'schizozilla','Schizozilla', 'scumbot','Scumbot', 'searchguild[_+ ]dmoz[_+ ]experiment','SearchGuild_DMOZ_Experiment', +'searchmetricsbot','SearchmetricsBot', 'seekbot','Seekbot', +'semrushbot', 'SemrushBot', 'sensis_web_crawler','Sensis Web Crawler', +'seokicks\.de', 'SEOkicks Webcrawler', 'seznambot','SeznamBot', 'shim\-crawler','Shim-Crawler', 'shoutcast','Shoutcast Directory Service', @@ -1744,6 +1764,7 @@ 'sohu\-search','sohu-search', 'sohu','sohu agent', 'snappy','Snappy', +'spbot', 'SEOprofiler Bot', 'sphere_scout','Sphere Scout', 'spip','SPIP', 'sproose_crawler','sproose crawler', @@ -1788,6 +1809,9 @@ 'w3c_validator','W3C Validator', 'watchmouse', 'WatchMouse Website Monitor', 'wavefire','Wavefire', +'waybackarchive\.org', 'No website, email: spider(at)waybackarchive.org', +# 2.12.2013 Project Honeypot reports at least one of the IPs used by waybackarchive with a spiderlytics UA string. +# Problably not related to the wayback machine of archive.org. 'webclipping\.com', 'WebClipping.com', 'webcompass', 'webcompass', 'webcrawl\.net','webcrawl.net', @@ -1842,8 +1866,8 @@ 'scanner', 'Unknown robot (identified by \'scanner\')', 'spider', 'Unknown robot (identified by \'spider\')', 'sucker', 'Unknown robot (identified by \'sucker\')', -'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')', -'[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')', +'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', +'[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'bot\' preceded by a space or one of the following characters _+:,.;/\-)', 'curl', 'Common *nix tool for automating web document retireval. Most likely a bot.', 'php', 'A PHP script', 'ruby\/', 'Ruby script', diff --git a/wwwroot/cgi-bin/lib/search_engines.pm b/wwwroot/cgi-bin/lib/search_engines.pm index 215a5422..aa73cd53 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pm +++ b/wwwroot/cgi-bin/lib/search_engines.pm @@ -215,6 +215,7 @@ 'googlee\.', 'googlecom\.com', 'goggle\.co\.hu', +'216\.239\.32\.20', '216\.239\.(35|37|39|51)\.100', '216\.239\.(35|37|39|51)\.101', '216\.239\.5[0-9]\.104', @@ -319,6 +320,11 @@ 'duckduckgo\.com', 'sr\.facemoods\.com', 'shoppstop\.com', +'searchya\.com', +'picsearch\.de', +'webssearches\.com', +'zapmeta\.de', +'localmoxie\.com', # Chello Portals 'chello\.at', 'chello\.be', @@ -417,6 +423,8 @@ 'sumaja\.de', 'navigationshilfe\.t-online\.de', 'umfis\.de', +'fastbot\.de', +'tixuma\.de', # Minor Hungarian search engines 'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu', 'tango\.hu', @@ -486,6 +494,7 @@ 'googlee\.','google', 'googlecom\.com','google', 'goggle\.co\.hu','google', +'216\.239\.32\.20', 'google', '216\.239\.(35|37|39|51)\.100','google_cache', '216\.239\.(35|37|39|51)\.101','google_cache', '216\.239\.5[0-9]\.104','google_cache', @@ -588,6 +597,11 @@ 'duckduckgo\.com', 'duckduckgo', 'sr\.facemoods\.com', 'facemoods', 'shoppstop\.com', 'shoppstop', +'searchya\.com', 'searchya', +'picsearch\.de', 'picsearch', +'webssearches\.com', 'webssearches', +'zapmeta\.de', 'zapmeta', +'localmoxie\.com', 'localmoxie', # Chello Portals 'chello\.at','chelloat', 'chello\.be','chellobe', @@ -725,6 +739,8 @@ 'sumaja\.de', 'sumaja', 'navigationshilfe\.t-online\.de', 'navigationshilfe', 'umfis\.de', 'umfis', +'fastbot\.de', 'fastbot_de', +'tixuma\.de', 'tixuma_de', # Minor Hungarian search engines 'heureka\.hu','heureka', 'vizsla\.origo\.hu','origo', @@ -909,6 +925,11 @@ 'duckduckgo', 'uddg=', 'facemoods', 'q=', 'shoppstop', 'keywords=', +'searchya', 'q=', +'picsearch', 'q=', +'webssearches', 'q=', +'zapmeta', 'query=', +'localmoxie', 'keyword=', # Chello Portals 'chelloat','q1=', 'chellobe','q1=', @@ -1009,6 +1030,8 @@ #'sumaja', 'no query string available', #There is no query string in the referrer url 'navigationshilfe', 'q=', 'umfis', 'suchbegriff=', +'fastbot_de', 'red=[0-9]*\+', +'tixuma_de', 'sc=', # Minor Hungarian search engines 'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=', 'keresolap_hu','q=', @@ -1191,6 +1214,11 @@ 'duckduckgo', 'DuckDuckGo (Does not provide search keyphrases, using found page instead)', 'facemoods', 'Facemoods Search', 'shoppstop', 'ShoppStop', +'searchya', 'Searchya', +'picsearch', 'picsearch', +'webssearches', 'Web Searches', +'zapmeta', 'ZapMeta', +'localmoxie', 'Local Moxie', # Chello Portals 'chelloat','Chello Austria', 'chellobe','Chello Belgium', @@ -1293,6 +1321,8 @@ 'sumaja', 'Sumaja', 'navigationshilfe', 'T-Online Navigationshilfe', 'umfis', 'UMFIS-Online Das Umweltfirmen-Informationssystem der IHKs in Deutschland', +'fastbot_de', 'Fastbot.de (Does not provide search keyphrases, using found page instead)', +'tixuma_de', 'Tixuma Deutschland', # Minor hungarian search engines 'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkereso', 'goliat','Goliat', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Kereso', 'tango_hu','Tango',