]> git.ipfire.org Git - people/ms/libloc.git/blobdiff - src/scripts/location-importer.in
importer: Improve checks for unspecified networks
[people/ms/libloc.git] / src / scripts / location-importer.in
index 7788d8c067bc8b6cf91780042d865e6e1bd6fa4d..4b340374226a0d70b971ccc354a4a55311dc7717 100644 (file)
@@ -48,6 +48,19 @@ VALID_ASN_RANGES = (
        (131072, 4199999999),
 )
 
+TRANSLATED_COUNTRIES = {
+       # When people say UK, they mean GB
+       "UK" : "GB",
+}
+
+IGNORED_COUNTRIES = set((
+       # Formerly Yugoslavia
+       "YU",
+
+       # Some people use ZZ to say "no country" or to hide the country
+       "ZZ",
+))
+
 # Configure the CSV parser for ARIN
 csv.register_dialect("arin", delimiter=",", quoting=csv.QUOTE_ALL, quotechar="\"")
 
@@ -294,7 +307,7 @@ class CLI(object):
                # Fetch all valid country codes to check parsed networks aganist...
                countries = self.db.query("SELECT country_code FROM countries ORDER BY country_code")
 
-               return [country.country_code for country in countries]
+               return set((country.country_code for country in countries))
 
        def handle_write(self, ns):
                """
@@ -680,7 +693,12 @@ class CLI(object):
                error = False
 
                # Fetch all valid country codes to check parsed networks against
-               validcountries = self.fetch_countries()
+               countries = self.fetch_countries()
+
+               # Check if we have countries
+               if not countries:
+                       log.error("Please import countries before importing any WHOIS data")
+                       return 1
 
                # Iterate over all potential sources
                for source in sorted(location.importer.SOURCES):
@@ -695,7 +713,7 @@ class CLI(object):
                                                name text NOT NULL, source text NOT NULL) ON COMMIT DROP;
                                        CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);
 
-                                       CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL,
+                                       CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text,
                                                original_countries text[] NOT NULL, source text NOT NULL)
                                                ON COMMIT DROP;
                                        CREATE INDEX _rirdata_search ON _rirdata
@@ -711,12 +729,12 @@ class CLI(object):
                                        # Fetch WHOIS sources
                                        for url in location.importer.WHOIS_SOURCES.get(source, []):
                                                for block in downloader.request_blocks(url):
-                                                       self._parse_block(block, source, validcountries)
+                                                       self._parse_block(block, source, countries)
 
                                        # Fetch extended sources
                                        for url in location.importer.EXTENDED_SOURCES.get(source, []):
                                                for line in downloader.request_lines(url):
-                                                       self._parse_line(line, source, validcountries)
+                                                       self._parse_line(line, source, countries)
                                except urllib.error.URLError as e:
                                        log.error("Could not retrieve data from %s: %s" % (source, e))
                                        error = True
@@ -896,14 +914,24 @@ class CLI(object):
                        We will return False in case a network is not suitable for adding
                        it to our database, and True otherwise.
                """
+               # Check input
+               if isinstance(network, ipaddress.IPv6Network):
+                       pass
+               elif isinstance(network, ipaddress.IPv4Network):
+                       pass
+               else:
+                       raise ValueError("Invalid network: %s (type %s)" % (network, type(network)))
 
-               if not network or not (isinstance(network, ipaddress.IPv4Network) or isinstance(network, ipaddress.IPv6Network)):
-                       return False
-
+               # Ignore anything that isn't globally routable
                if not network.is_global:
                        log.debug("Skipping non-globally routable network: %s" % network)
                        return False
 
+               # Ignore anything that is an unspecified IP range (See RFC 5735 for IPv4 or RFC 2373 for IPv6)
+               elif network.is_unspecified:
+                       log.debug("Skipping unspecified network: %s" % network)
+                       return False
+
                if network.version == 4:
                        if network.prefixlen < 7:
                                log.debug("Skipping too big IP chunk: %s" % network)
@@ -913,10 +941,6 @@ class CLI(object):
                                log.debug("Skipping network too small to be publicly announced: %s" % network)
                                return False
 
-                       if str(network.network_address) == "0.0.0.0":
-                               log.debug("Skipping network based on 0.0.0.0: %s" % network)
-                               return False
-
                elif network.version == 6:
                        if network.prefixlen < 10:
                                log.debug("Skipping too big IP chunk: %s" % network)
@@ -926,15 +950,6 @@ class CLI(object):
                                log.debug("Skipping network too small to be publicly announced: %s" % network)
                                return False
 
-                       if str(network.network_address) == "::":
-                               log.debug("Skipping network based on '::': %s" % network)
-                               return False
-
-               else:
-                       # This should not happen...
-                       log.warning("Skipping network of unknown family, this should not happen: %s" % network)
-                       return False
-
                # In case we have made it here, the network is considered to
                # be suitable for libloc consumption...
                return True
@@ -952,7 +967,7 @@ class CLI(object):
                log.info("Supplied ASN %s out of publicly routable ASN ranges" % asn)
                return False
 
-       def _parse_block(self, block, source_key, validcountries = None):
+       def _parse_block(self, block, source_key, countries):
                # Get first line to find out what type of block this is
                line = block[0]
 
@@ -962,7 +977,7 @@ class CLI(object):
 
                # inetnum
                if line.startswith("inet6num:") or line.startswith("inetnum:"):
-                       return self._parse_inetnum_block(block, source_key, validcountries)
+                       return self._parse_inetnum_block(block, source_key, countries)
 
                # organisation
                elif line.startswith("organisation:"):
@@ -1015,7 +1030,7 @@ class CLI(object):
                        autnum.get("asn"), autnum.get("org"), source_key,
                )
 
-       def _parse_inetnum_block(self, block, source_key, validcountries = None):
+       def _parse_inetnum_block(self, block, source_key, countries):
                log.debug("Parsing inetnum block:")
 
                inetnum = {}
@@ -1081,21 +1096,28 @@ class CLI(object):
                                inetnum[key] = [ipaddress.ip_network(val, strict=False)]
 
                        elif key == "country":
-                               val = val.upper()
+                               cc = val.upper()
 
-                               # Catch RIR data objects with more than one country code...
-                               if not key in inetnum:
-                                       inetnum[key] = []
-                               else:
-                                       if val in inetnum.get("country"):
-                                               # ... but keep this list distinct...
-                                               continue
+                               # Ignore certain country codes
+                               if cc in IGNORED_COUNTRIES:
+                                       log.debug("Ignoring country code '%s'" % cc)
+                                       continue
 
-                               # When people set country codes to "UK", they actually mean "GB"
-                               if val == "UK":
-                                       val = "GB"
+                               # Translate country codes
+                               try:
+                                       cc = TRANSLATED_COUNTRIES[cc]
+                               except KeyError:
+                                       pass
+
+                               # Do we know this country?
+                               if not cc in countries:
+                                       log.warning("Skipping invalid country code '%s'" % cc)
+                                       continue
 
-                               inetnum[key].append(val)
+                               try:
+                                       inetnum[key].append(cc)
+                               except KeyError:
+                                       inetnum[key] = [cc]
 
                        # Parse the geofeed attribute
                        elif key == "geofeed":
@@ -1108,37 +1130,51 @@ class CLI(object):
                                        inetnum["geofeed"] = m.group(1)
 
                # Skip empty objects
-               if not inetnum or not "country" in inetnum:
+               if not inetnum:
                        return
 
-               # Prepare skipping objects with unknown country codes...
-               invalidcountries = [singlecountry for singlecountry in inetnum.get("country") if singlecountry not in validcountries]
-
                # Iterate through all networks enumerated from above, check them for plausibility and insert
                # them into the database, if _check_parsed_network() succeeded
                for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
-                       if self._check_parsed_network(single_network):
-                               # Skip objects with unknown country codes if they are valid to avoid log spam...
-                               if validcountries and invalidcountries:
-                                       log.warning("Skipping network with bogus countr(y|ies) %s (original countries: %s): %s" % \
-                                               (invalidcountries, inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
-                                       break
+                       if not self._check_parsed_network(single_network):
+                               continue
 
-                               # Everything is fine here, run INSERT statement...
-                               self.db.execute("INSERT INTO _rirdata(network, country, original_countries, source) \
-                                       VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
-                                       "%s" % single_network, inetnum.get("country")[0], inetnum.get("country"), source_key,
+                       # Fetch the countries or use a list with an empty country
+                       countries = inetnum.get("country", [None])
+
+                       # Insert the network into the database but only use the first country code
+                       for cc in countries:
+                               self.db.execute("""
+                                       INSERT INTO
+                                               _rirdata
+                                       (
+                                               network,
+                                               country,
+                                               original_countries,
+                                               source
+                                       )
+                                       VALUES
+                                       (
+                                               %s, %s, %s, %s
+                                       )
+                                       ON CONFLICT (network)
+                                               DO UPDATE SET country = excluded.country
+                                       """, "%s" % single_network, cc, [cc for cc in countries if cc], source_key,
                                )
 
-                               # Update any geofeed information
-                               geofeed = inetnum.get("geofeed", None)
-                               if geofeed:
-                                       self._parse_geofeed(geofeed, single_network)
+                               # If there is more than one country, we will only use the first one
+                               break
 
-                               # Delete any previous geofeeds
-                               else:
-                                       self.db.execute("DELETE FROM network_geofeeds WHERE network = %s",
-                                               "%s" % single_network)
+                       # Update any geofeed information
+                       geofeed = inetnum.get("geofeed", None)
+                       if geofeed:
+                               self._parse_geofeed(geofeed, single_network)
+
+                       # Delete any previous geofeeds
+                       else:
+                               self.db.execute(
+                                       "DELETE FROM network_geofeeds WHERE network = %s", "%s" % single_network,
+                               )
 
        def _parse_geofeed(self, url, single_network):
                # Parse the URL
@@ -1188,7 +1224,7 @@ class CLI(object):
                        org.get("organisation"), org.get("org-name"), source_key,
                )
 
-       def _parse_line(self, line, source_key, validcountries = None):
+       def _parse_line(self, line, source_key, validcountries=None):
                # Skip version line
                if line.startswith("2"):
                        return
@@ -1203,6 +1239,11 @@ class CLI(object):
                        log.warning("Could not parse line: %s" % line)
                        return
 
+               # Skip any unknown protocols
+               if not type in ("ipv6", "ipv4"):
+                       log.warning("Unknown IP protocol '%s'" % type)
+                       return
+
                # Skip any lines that are for stats only or do not have a country
                # code at all (avoids log spam below)
                if not country_code or country_code == '*':
@@ -1214,10 +1255,6 @@ class CLI(object):
                                (country_code, line))
                        return
 
-               if type in ("ipv6", "ipv4"):
-                       return self._parse_ip_line(country_code, type, line, source_key)
-
-       def _parse_ip_line(self, country, type, line, source_key):
                try:
                        address, prefix, date, status, organization = line.split("|")
                except ValueError:
@@ -1255,10 +1292,22 @@ class CLI(object):
                if not self._check_parsed_network(network):
                        return
 
-               self.db.execute("INSERT INTO networks(network, country, original_countries, source) \
-                       VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO \
-                       UPDATE SET country = excluded.country",
-                       "%s" % network, country, [country], source_key,
+               self.db.execute("""
+                       INSERT INTO
+                               networks
+                       (
+                               network,
+                               country,
+                               original_countries,
+                               source
+                       )
+                       VALUES
+                       (
+                               %s, %s, %s, %s
+                       )
+                       ON CONFLICT (network)
+                               DO UPDATE SET country = excluded.country
+                       """, "%s" % network, country_code, [country_code], source_key,
                )
 
        def _import_as_names_from_arin(self, downloader):