]> git.ipfire.org Git - people/ms/libloc.git/blobdiff - src/scripts/location-importer.in
importer: Make the export 200x faster
[people/ms/libloc.git] / src / scripts / location-importer.in
index afd1fd05d5f78b6d4fcc58fc9c9b463f4929cebb..80290ffe953502657614e67a280c5cad05ff3c0a 100644 (file)
@@ -369,296 +369,416 @@ class CLI(object):
                # Add all networks
                log.info("Writing networks...")
 
-               # Select all known networks
-               rows = self.db.query("""
-                       WITH known_networks AS (
-                               SELECT network FROM announcements
-                               UNION
-                               SELECT network FROM networks
-                               UNION
-                               SELECT network FROM network_feeds
-                               UNION
-                               SELECT network FROM network_overrides
-                               UNION
-                               SELECT network FROM geofeed_networks
-                       ),
-
-                       ordered_networks AS (
-                               SELECT
-                                       known_networks.network AS network,
-                                       announcements.autnum AS autnum,
-                                       networks.country AS country,
+               # Create a new temporary table where we collect
+               # the networks that we are interested in
+               self.db.execute("""
+                       CREATE TEMPORARY TABLE
+                               n
+                       (
+                               network               inet NOT NULL,
+                               autnum                integer,
+                               country               text,
+                               is_anonymous_proxy    boolean,
+                               is_satellite_provider boolean,
+                               is_anycast            boolean,
+                               is_drop               boolean
+                       )
+                       WITH (FILLFACTOR = 50)
+               """)
 
-                                       -- Must be part of returned values for ORDER BY clause
-                                       masklen(announcements.network) AS sort_a,
-                                       masklen(networks.network) AS sort_b
-                               FROM
-                                       known_networks
-                               LEFT JOIN
-                                       announcements ON known_networks.network <<= announcements.network
-                               LEFT JOIN
-                                       networks ON known_networks.network <<= networks.network
-                               ORDER BY
-                                       known_networks.network,
-                                       sort_a DESC,
-                                       sort_b DESC
+               # Add all known networks
+               self.db.execute("""
+                       INSERT INTO
+                               n
+                       (
+                               network
                        )
 
-                       -- Return a list of those networks enriched with all
-                       -- other information that we store in the database
                        SELECT
-                               DISTINCT ON (network)
-                               network,
-                               autnum,
+                               network
+                       FROM
+                               announcements
 
-                               -- Country
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
-                                               ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       country
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
-                                               ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       geofeed_networks.country AS country
-                                               FROM
-                                                       network_geofeeds
+                       UNION
 
-                                               -- Join the data from the geofeeds
-                                               LEFT JOIN
-                                                       geofeeds ON network_geofeeds.url = geofeeds.url
-                                               LEFT JOIN
-                                                       geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id
+                       SELECT
+                               network
+                       FROM
+                               networks
 
-                                               -- Check whether we have a geofeed for this network
-                                               WHERE
-                                                       networks.network <<= network_geofeeds.network
-                                               AND
-                                                       networks.network <<= geofeed_networks.network
+                       UNION
 
-                                               -- Filter for the best result
-                                               ORDER BY
-                                                       masklen(geofeed_networks.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       networks.country
-                               ) AS country,
+                       SELECT
+                               network
+                       FROM
+                               network_feeds
 
-                               -- Flags
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
-                                               ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
-                                               ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anonymous_proxy
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_anonymous_proxy,
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
-                                               ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
-                                               ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_satellite_provider
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_satellite_provider,
-                               COALESCE(
-                                       (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
-                                               ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
+                       UNION
+
+                       SELECT
+                               network
+                       FROM
+                               network_overrides
+
+                       UNION
+
+                       SELECT
+                               network
+                       FROM
+                               geofeed_networks
+               """)
+
+               # Create an index to search through networks faster
+               self.db.execute("""
+                       CREATE INDEX
+                               n_search
+                       ON
+                               n
+                       USING
+                               SPGIST(network)
+               """)
+
+               # Analyze n
+               self.db.execute("ANALYZE n")
+
+               # Apply the AS number to all networks
+               self.db.execute("""
+                       -- Join all networks together with their most specific announcements
+                       WITH announcements AS (
+                               SELECT
+                                       n.network,
+                                       announcements.autnum,
+
+                                       -- Sort all merges and number them so
+                                       -- that we can later select the best one
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
+                                                       masklen(announcements.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       announcements
+                               ON
+                                       announcements.network >>= n.network
+                       )
+
+                       -- Store the result
+                       UPDATE
+                               n
+                       SET
+                               autnum = announcements.autnum
+                       FROM
+                               announcements
+                       WHERE
+                               announcements.network = n.network
+                       AND
+                               announcements.row = 1
+                       """,
+               )
+
+               # Apply country information
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       networks.country,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_anycast
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_anycast,
-                               COALESCE(
+                                                       masklen(networks.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       networks
+                               ON
+                                       networks.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country = networks.country
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+                       """,
+               )
+
+               # Add all country information from Geofeeds
+               self.db.execute("""
+                       WITH geofeeds AS (
+                               SELECT
+                                       DISTINCT ON (geofeed_networks.network)
+                                       geofeed_networks.network,
+                                       geofeed_networks.country
+                               FROM
+                                       geofeeds
+                               JOIN
+                                       network_geofeeds networks
+                               ON
+                                       geofeeds.url = networks.url
+                               JOIN
+                                       geofeed_networks
+                               ON
+                                       geofeeds.id = geofeed_networks.geofeed_id
+                               AND
+                                       networks.network >>= geofeed_networks.network
+                       ),
+
+                       networks AS (
+                               SELECT
+                                       n.network,
+                                       geofeeds.country,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       network_overrides overrides
-                                               WHERE
-                                                       networks.network <<= overrides.network
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       masklen(overrides.network) DESC
-                                               LIMIT 1
-                                       ),
+                                                       masklen(geofeeds.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       geofeeds
+                               ON
+                                       geofeeds.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country = networks.country
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+                       """,
+               )
+
+               # Apply country and flags from feeds
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       network_feeds.country,
+
+                                       -- Flags
+                                       network_feeds.is_anonymous_proxy,
+                                       network_feeds.is_satellite_provider,
+                                       network_feeds.is_anycast,
+                                       network_feeds.is_drop,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       network_feeds feeds
-                                               WHERE
-                                                       networks.network <<= feeds.network
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       masklen(feeds.network) DESC
-                                               LIMIT 1
-                                       ),
+                                                       masklen(network_feeds.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       network_feeds
+                               ON
+                                       network_feeds.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+                       """,
+               )
+
+               # Apply country and flags from AS feeds
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       autnum_feeds.country,
+
+                                       -- Flags
+                                       autnum_feeds.is_anonymous_proxy,
+                                       autnum_feeds.is_satellite_provider,
+                                       autnum_feeds.is_anycast,
+                                       autnum_feeds.is_drop
+                               FROM
+                                       n
+                               JOIN
+                                       autnum_feeds
+                               ON
+                                       autnum_feeds.number = n.autnum
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+               """)
+
+               # Apply network overrides
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       network_overrides.country,
+
+                                       -- Flags
+                                       network_overrides.is_anonymous_proxy,
+                                       network_overrides.is_satellite_provider,
+                                       network_overrides.is_anycast,
+                                       network_overrides.is_drop,
+
+                                       ROW_NUMBER()
+                                       OVER
                                        (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       autnum_feeds feeds
-                                               WHERE
-                                                       networks.autnum = feeds.number
+                                               PARTITION BY
+                                                       n.network
                                                ORDER BY
-                                                       source
-                                               LIMIT 1
-                                       ),
-                                       (
-                                               SELECT
-                                                       is_drop
-                                               FROM
-                                                       autnum_overrides overrides
-                                               WHERE
-                                                       networks.autnum = overrides.number
-                                       ),
-                                       FALSE
-                               ) AS is_drop
+                                                       masklen(network_overrides.network) DESC
+                                       ) AS row
+                               FROM
+                                       n
+                               JOIN
+                                       network_overrides
+                               ON
+                                       network_overrides.network >>= n.network
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
                        FROM
-                               ordered_networks networks
+                               networks
+                       WHERE
+                               networks.network = n.network
+                       AND
+                               networks.row = 1
+               """)
+
+               # Apply AS overrides
+               self.db.execute("""
+                       WITH networks AS (
+                               SELECT
+                                       n.network,
+                                       autnum_overrides.country,
+
+                                       -- Flags
+                                       autnum_overrides.is_anonymous_proxy,
+                                       autnum_overrides.is_satellite_provider,
+                                       autnum_overrides.is_anycast,
+                                       autnum_overrides.is_drop
+                               FROM
+                                       n
+                               JOIN
+                                       autnum_overrides
+                               ON
+                                       autnum_overrides.number = n.autnum
+                       )
+
+                       UPDATE
+                               n
+                       SET
+                               country =
+                                       COALESCE(networks.country, n.country),
+
+                               is_anonymous_proxy =
+                                       COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy),
+
+                               is_satellite_provider =
+                                       COALESCE(networks.is_satellite_provider, n.is_satellite_provider),
+
+                               is_anycast =
+                                       COALESCE(networks.is_anycast, n.is_anycast),
+
+                               is_drop =
+                                       COALESCE(networks.is_drop, n.is_drop)
+                       FROM
+                               networks
+                       WHERE
+                               networks.network = n.network
+               """)
+
+               # Here we could remove some networks that we no longer need, but since we
+               # already have implemented our deduplication/merge algorithm this would not
+               # be necessary.
+
+               # Export the entire temporary table
+               rows = self.db.query("""
+                       SELECT
+                               *
+                       FROM
+                               n
+                       ORDER BY
+                               network
                """)
 
                for row in rows:
@@ -687,7 +807,17 @@ class CLI(object):
 
                # Add all countries
                log.info("Writing countries...")
-               rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
+
+               # Select all countries
+               rows = self.db.query("""
+                       SELECT
+                               *
+                       FROM
+                               countries
+                       ORDER BY
+                               country_code
+                       """,
+               )
 
                for row in rows:
                        c = writer.add_country(row.country_code)