]> git.ipfire.org Git - people/ms/libloc.git/blobdiff - src/python/location-importer.in
location-importer.in: always convert organisation handles into upper cases
[people/ms/libloc.git] / src / python / location-importer.in
index e3a07a07c65eddb7a64f28b49b0ed6a8914c28b5..2dec89e4b9c3f422848b27447d2c555703622b0a 100644 (file)
@@ -152,6 +152,7 @@ class CLI(object):
                                        last_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP);
                                CREATE UNIQUE INDEX IF NOT EXISTS announcements_networks ON announcements(network);
                                CREATE INDEX IF NOT EXISTS announcements_family ON announcements(family(network));
+                               CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);
 
                                -- autnums
                                CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
@@ -165,6 +166,7 @@ class CLI(object):
                                -- networks
                                CREATE TABLE IF NOT EXISTS networks(network inet, country text);
                                CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
+                               CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
                                CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);
 
                                -- overrides
@@ -188,6 +190,8 @@ class CLI(object):
                                );
                                CREATE UNIQUE INDEX IF NOT EXISTS network_overrides_network
                                        ON network_overrides(network);
+                               CREATE INDEX IF NOT EXISTS network_overrides_search
+                                       ON network_overrides USING GIST(network inet_ops);
                        """)
 
                return db
@@ -234,32 +238,24 @@ class CLI(object):
 
                # Select all known networks
                rows = self.db.query("""
-                       -- Get a (sorted) list of all known networks
-                       WITH known_networks AS (
-                                       SELECT network FROM announcements
-                               UNION
-                                       SELECT network FROM networks
-                               ORDER BY network
-                       )
-
                        -- Return a list of those networks enriched with all
                        -- other information that we store in the database
                        SELECT
-                               DISTINCT ON (known_networks.network)
-                               known_networks.network AS network,
-                               announcements.autnum AS autnum,
+                               DISTINCT ON (network)
+                               network,
+                               autnum,
 
                                -- Country
                                COALESCE(
                                        (
                                                SELECT country FROM network_overrides overrides
-                                                       WHERE announcements.network <<= overrides.network
+                                                       WHERE networks.network <<= overrides.network
                                                        ORDER BY masklen(overrides.network) DESC
                                                        LIMIT 1
                                        ),
                                        (
                                                SELECT country FROM autnum_overrides overrides
-                                                       WHERE announcements.autnum = overrides.number
+                                                       WHERE networks.autnum = overrides.number
                                        ),
                                        networks.country
                                ) AS country,
@@ -268,50 +264,67 @@ class CLI(object):
                                COALESCE(
                                        (
                                                SELECT is_anonymous_proxy FROM network_overrides overrides
-                                                       WHERE announcements.network <<= overrides.network
+                                                       WHERE networks.network <<= overrides.network
                                                        ORDER BY masklen(overrides.network) DESC
                                                        LIMIT 1
                                        ),
                                        (
                                                SELECT is_anonymous_proxy FROM autnum_overrides overrides
-                                                       WHERE announcements.autnum = overrides.number
+                                                       WHERE networks.autnum = overrides.number
                                        ),
                                        FALSE
                                ) AS is_anonymous_proxy,
                                COALESCE(
                                        (
                                                SELECT is_satellite_provider FROM network_overrides overrides
-                                                       WHERE announcements.network <<= overrides.network
+                                                       WHERE networks.network <<= overrides.network
                                                        ORDER BY masklen(overrides.network) DESC
                                                        LIMIT 1
                                        ),
                                        (
                                                SELECT is_satellite_provider FROM autnum_overrides overrides
-                                                       WHERE announcements.autnum = overrides.number
+                                                       WHERE networks.autnum = overrides.number
                                        ),
                                        FALSE
                                ) AS is_satellite_provider,
                                COALESCE(
                                        (
                                                SELECT is_anycast FROM network_overrides overrides
-                                                       WHERE announcements.network <<= overrides.network
+                                                       WHERE networks.network <<= overrides.network
                                                        ORDER BY masklen(overrides.network) DESC
                                                        LIMIT 1
                                        ),
                                        (
                                                SELECT is_anycast FROM autnum_overrides overrides
-                                                       WHERE announcements.autnum = overrides.number
+                                                       WHERE networks.autnum = overrides.number
                                        ),
                                        FALSE
-                               ) AS is_anycast,
-
-                               -- Must be part of returned values for ORDER BY clause
-                               masklen(announcements.network) AS sort_a,
-                               masklen(networks.network) AS sort_b
-                       FROM known_networks
-                               LEFT JOIN announcements ON known_networks.network <<= announcements.network
-                               LEFT JOIN networks ON known_networks.network <<= networks.network
-                       ORDER BY known_networks.network, sort_a DESC, sort_b DESC
+                               ) AS is_anycast
+                       FROM (
+                               SELECT
+                                       known_networks.network AS network,
+                                       announcements.autnum AS autnum,
+                                       networks.country AS country,
+
+                                       -- Must be part of returned values for ORDER BY clause
+                                       masklen(announcements.network) AS sort_a,
+                                       masklen(networks.network) AS sort_b
+                               FROM (
+                                               SELECT network FROM announcements
+                                       UNION ALL
+                                               SELECT network FROM networks
+                                       UNION ALL
+                                               SELECT network FROM network_overrides
+                                       ) known_networks
+                               LEFT JOIN
+                                       announcements ON known_networks.network <<= announcements.network
+                               LEFT JOIN
+                                       networks ON known_networks.network <<= networks.network
+                               ORDER BY
+                                       known_networks.network,
+                                       sort_a DESC,
+                                       sort_b DESC
+                       ) networks
                """)
 
                for row in rows:
@@ -363,6 +376,16 @@ class CLI(object):
                                CREATE TEMPORARY TABLE _organizations(handle text, name text NOT NULL)
                                        ON COMMIT DROP;
                                CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);
+
+                               CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL)
+                                       ON COMMIT DROP;
+                               CREATE INDEX _rirdata_search ON _rirdata USING BTREE(family(network), masklen(network));
+                               CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
+                       """)
+
+                       # Remove all previously imported content
+                       self.db.execute("""
+                               TRUNCATE TABLE networks;
                        """)
 
                        for source in location.importer.WHOIS_SOURCES:
@@ -370,6 +393,67 @@ class CLI(object):
                                        for block in f:
                                                self._parse_block(block)
 
+                       # Process all parsed networks from every RIR we happen to have access to,
+                       # insert the largest network chunks into the networks table immediately...
+                       families = self.db.query("SELECT DISTINCT family(network) AS family FROM _rirdata ORDER BY family(network)")
+
+                       for family in (row.family for row in families):
+                               smallest = self.db.get("SELECT MIN(masklen(network)) AS prefix FROM _rirdata WHERE family(network) = %s", family)
+
+                               self.db.execute("INSERT INTO networks(network, country) \
+                                       SELECT network, country FROM _rirdata WHERE masklen(network) = %s AND family(network) = %s", smallest.prefix, family)
+
+                               # ... determine any other prefixes for this network family, ...
+                               prefixes = self.db.query("SELECT DISTINCT masklen(network) AS prefix FROM _rirdata \
+                                       WHERE family(network) = %s ORDER BY masklen(network) ASC OFFSET 1", family)
+
+                               # ... and insert networks with this prefix in case they provide additional
+                               # information (i. e. subnet of a larger chunk with a different country)
+                               for prefix in (row.prefix for row in prefixes):
+                                       self.db.execute("""
+                                               WITH candidates AS (
+                                                       SELECT
+                                                               _rirdata.network,
+                                                               _rirdata.country
+                                                       FROM
+                                                               _rirdata
+                                                       WHERE
+                                                               family(_rirdata.network) = %s
+                                                       AND
+                                                               masklen(_rirdata.network) = %s
+                                               ),
+                                               filtered AS (
+                                                       SELECT
+                                                               DISTINCT ON (c.network)
+                                                               c.network,
+                                                               c.country,
+                                                               masklen(networks.network),
+                                                               networks.country AS parent_country
+                                                       FROM
+                                                               candidates c
+                                                       LEFT JOIN
+                                                               networks
+                                                       ON
+                                                               c.network << networks.network
+                                                       ORDER BY
+                                                               c.network,
+                                                               masklen(networks.network) DESC NULLS LAST
+                                               )
+                                               INSERT INTO
+                                                       networks(network, country)
+                                               SELECT
+                                                       network,
+                                                       country
+                                               FROM
+                                                       filtered
+                                               WHERE
+                                                       parent_country IS NULL
+                                               OR
+                                                       country <> parent_country
+                                               ON CONFLICT DO NOTHING""",
+                                               family, prefix,
+                                       )
+
                        self.db.execute("""
                                INSERT INTO autnums(number, name)
                                        SELECT _autnums.number, _organizations.name FROM _autnums
@@ -385,6 +469,69 @@ class CLI(object):
                                        for line in f:
                                                self._parse_line(line)
 
+       def _check_parsed_network(self, network):
+               """
+                       Assistive function to detect and subsequently sort out parsed
+                       networks from RIR data (both Whois and so-called "extended sources"),
+                       which are or have...
+
+                       (a) not globally routable (RFC 1918 space, et al.)
+                       (b) covering a too large chunk of the IP address space (prefix length
+                               is < 7 for IPv4 networks, and < 10 for IPv6)
+                       (c) "0.0.0.0" or "::" as a network address
+                       (d) are too small for being publicly announced (we have decided not to
+                               process them at the moment, as they significantly enlarge our
+                               database without providing very helpful additional information)
+
+                       This unfortunately is necessary due to brain-dead clutter across
+                       various RIR databases, causing mismatches and eventually disruptions.
+
+                       We will return False in case a network is not suitable for adding
+                       it to our database, and True otherwise.
+               """
+
+               if not network or not (isinstance(network, ipaddress.IPv4Network) or isinstance(network, ipaddress.IPv6Network)):
+                       return False
+
+               if not network.is_global:
+                       log.warning("Skipping non-globally routable network: %s" % network)
+                       return False
+
+               if network.version == 4:
+                       if network.prefixlen < 7:
+                               log.warning("Skipping too big IP chunk: %s" % network)
+                               return False
+
+                       if network.prefixlen > 24:
+                               log.debug("Skipping network too small to be publicly announced: %s" % network)
+                               return False
+
+                       if str(network.network_address) == "0.0.0.0":
+                               log.warning("Skipping network based on 0.0.0.0: %s" % network)
+                               return False
+
+               elif network.version == 6:
+                       if network.prefixlen < 10:
+                               log.warning("Skipping too big IP chunk: %s" % network)
+                               return False
+
+                       if network.prefixlen > 48:
+                               log.debug("Skipping network too small to be publicly announced: %s" % network)
+                               return False
+
+                       if str(network.network_address) == "::":
+                               log.warning("Skipping network based on '::': %s" % network)
+                               return False
+
+               else:
+                       # This should not happen...
+                       log.warning("Skipping network of unknown family, this should not happen: %s" % network)
+                       return False
+
+               # In case we have made it here, the network is considered to
+               # be suitable for libloc consumption...
+               return True
+
        def _parse_block(self, block):
                # Get first line to find out what type of block this is
                line = block[0]
@@ -413,7 +560,7 @@ class CLI(object):
                                        autnum["asn"] = m.group(2)
 
                        elif key == "org":
-                               autnum[key] = val
+                               autnum[key] = val.upper()
 
                # Skip empty objects
                if not autnum:
@@ -427,15 +574,22 @@ class CLI(object):
                )
 
        def _parse_inetnum_block(self, block):
-               logging.debug("Parsing inetnum block:")
+               log.debug("Parsing inetnum block:")
 
                inetnum = {}
                for line in block:
-                       logging.debug(line)
+                       log.debug(line)
 
                        # Split line
                        key, val = split_line(line)
 
+                       # Filter any inetnum records which are only referring to IP space
+                       # not managed by that specific RIR...
+                       if key == "netname":
+                               if re.match(r"(ERX-NETBLOCK|(AFRINIC|ARIN|LACNIC|RIPE)-CIDR-BLOCK|IANA-NETBLOCK-\d{1,3}|NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK)", val.strip()):
+                                       log.warning("Skipping record indicating historic/orphaned data: %s" % val.strip())
+                                       return
+
                        if key == "inetnum":
                                start_address, delim, end_address = val.partition("-")
 
@@ -447,7 +601,7 @@ class CLI(object):
                                        start_address = ipaddress.ip_address(start_address)
                                        end_address   = ipaddress.ip_address(end_address)
                                except ValueError:
-                                       logging.warning("Could not parse line: %s" % line)
+                                       log.warning("Could not parse line: %s" % line)
                                        return
 
                                # Set prefix to default
@@ -464,23 +618,24 @@ class CLI(object):
                                inetnum[key] = val
 
                        elif key == "country":
-                               if val == "UNITED STATES":
-                                       val = "US"
-
                                inetnum[key] = val.upper()
 
                # Skip empty objects
-               if not inetnum:
+               if not inetnum or not "country" in inetnum:
+                       return
+
+               # Skip objects with bogus country code 'ZZ'
+               if inetnum.get("country") == "ZZ":
+                       log.warning("Skipping network with bogus country 'ZZ': %s" % \
+                               (inetnum.get("inet6num") or inetnum.get("inetnum")))
                        return
 
                network = ipaddress.ip_network(inetnum.get("inet6num") or inetnum.get("inetnum"), strict=False)
 
-               # Bail out in case we have processed a non-public IP network
-               if network.is_private:
-                       logging.warning("Skipping non-globally routable network: %s" % network)
+               if not self._check_parsed_network(network):
                        return
 
-               self.db.execute("INSERT INTO networks(network, country) \
+               self.db.execute("INSERT INTO _rirdata(network, country) \
                        VALUES(%s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
                        "%s" % network, inetnum.get("country"),
                )
@@ -491,7 +646,9 @@ class CLI(object):
                        # Split line
                        key, val = split_line(line)
 
-                       if key in ("organisation", "org-name"):
+                       if key == "organisation":
+                               org[key] = val.upper()
+                       elif key == "org-name":
                                org[key] = val
 
                # Skip empty objects
@@ -561,6 +718,9 @@ class CLI(object):
                        log.warning("Invalid IP address: %s" % address)
                        return
 
+               if not self._check_parsed_network(network):
+                       return
+
                self.db.execute("INSERT INTO networks(network, country) \
                        VALUES(%s, %s) ON CONFLICT (network) DO \
                        UPDATE SET country = excluded.country",