From: Michael Tremer Date: Mon, 18 Mar 2024 16:56:50 +0000 (+0000) Subject: importer: Make the export 200x faster X-Git-Tag: 0.9.18~78 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c0a5f8f4be677c4727b7a896b4ee872b313e0c67;p=location%2Flibloc.git importer: Make the export 200x faster The export became a huge problem here as it was very slow. One large query produced one very large output which was difficult to test and validate as the total export could easily take 17-20 hrs. This patch fixes this so that the export now takes around ~5min. Sadly we are still not using more than one processor core, because PostgreSQL thinks its faster with just the one, but we trade a lot of CPU time for a little bit of extra memory instead. We create a new temporary table that holds all networks that we are interested in. We then run a couple of broken down queries that perform one thing instead of multiple at the same time. First, we add all AS numbers, then all countries which are the longest queries taking now around 20-30s. Then we apply any data from the external feeds and our own overrides. The final export requires about 1.7 GiB of memory because we don't perform any deduplication in the database any more. That has been a lot better implemented in our own code and I don't think that < 2 GiB is a reason for concern. Signed-off-by: Michael Tremer --- diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in index afd1fd0..80290ff 100644 --- a/src/scripts/location-importer.in +++ b/src/scripts/location-importer.in @@ -369,296 +369,416 @@ class CLI(object): # Add all networks log.info("Writing networks...") - # Select all known networks - rows = self.db.query(""" - WITH known_networks AS ( - SELECT network FROM announcements - UNION - SELECT network FROM networks - UNION - SELECT network FROM network_feeds - UNION - SELECT network FROM network_overrides - UNION - SELECT network FROM geofeed_networks - ), - - ordered_networks AS ( - SELECT - known_networks.network AS network, - announcements.autnum AS autnum, - networks.country AS country, + # Create a new temporary table where we collect + # the networks that we are interested in + self.db.execute(""" + CREATE TEMPORARY TABLE + n + ( + network inet NOT NULL, + autnum integer, + country text, + is_anonymous_proxy boolean, + is_satellite_provider boolean, + is_anycast boolean, + is_drop boolean + ) + WITH (FILLFACTOR = 50) + """) - -- Must be part of returned values for ORDER BY clause - masklen(announcements.network) AS sort_a, - masklen(networks.network) AS sort_b - FROM - known_networks - LEFT JOIN - announcements ON known_networks.network <<= announcements.network - LEFT JOIN - networks ON known_networks.network <<= networks.network - ORDER BY - known_networks.network, - sort_a DESC, - sort_b DESC + # Add all known networks + self.db.execute(""" + INSERT INTO + n + ( + network ) - -- Return a list of those networks enriched with all - -- other information that we store in the database SELECT - DISTINCT ON (network) - network, - autnum, + network + FROM + announcements - -- Country - COALESCE( - ( - SELECT - country - FROM - network_overrides overrides - WHERE - networks.network <<= overrides.network - ORDER BY - masklen(overrides.network) DESC - LIMIT 1 - ), - ( - SELECT - country - FROM - autnum_overrides overrides - WHERE - networks.autnum = overrides.number - ), - ( - SELECT - country - FROM - network_feeds feeds - WHERE - networks.network <<= feeds.network - ORDER BY - masklen(feeds.network) DESC - LIMIT 1 - ), - ( - SELECT - country - FROM - autnum_feeds feeds - WHERE - networks.autnum = feeds.number - ORDER BY - source - LIMIT 1 - ), - ( - SELECT - geofeed_networks.country AS country - FROM - network_geofeeds + UNION - -- Join the data from the geofeeds - LEFT JOIN - geofeeds ON network_geofeeds.url = geofeeds.url - LEFT JOIN - geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id + SELECT + network + FROM + networks - -- Check whether we have a geofeed for this network - WHERE - networks.network <<= network_geofeeds.network - AND - networks.network <<= geofeed_networks.network + UNION - -- Filter for the best result - ORDER BY - masklen(geofeed_networks.network) DESC - LIMIT 1 - ), - networks.country - ) AS country, + SELECT + network + FROM + network_feeds - -- Flags - COALESCE( - ( - SELECT - is_anonymous_proxy - FROM - network_overrides overrides - WHERE - networks.network <<= overrides.network - ORDER BY - masklen(overrides.network) DESC - LIMIT 1 - ), - ( - SELECT - is_anonymous_proxy - FROM - network_feeds feeds - WHERE - networks.network <<= feeds.network - ORDER BY - masklen(feeds.network) DESC - LIMIT 1 - ), - ( - SELECT - is_anonymous_proxy - FROM - autnum_feeds feeds - WHERE - networks.autnum = feeds.number - ORDER BY - source - LIMIT 1 - ), - ( - SELECT - is_anonymous_proxy - FROM - autnum_overrides overrides - WHERE - networks.autnum = overrides.number - ), - FALSE - ) AS is_anonymous_proxy, - COALESCE( - ( - SELECT - is_satellite_provider - FROM - network_overrides overrides - WHERE - networks.network <<= overrides.network - ORDER BY - masklen(overrides.network) DESC - LIMIT 1 - ), - ( - SELECT - is_satellite_provider - FROM - network_feeds feeds - WHERE - networks.network <<= feeds.network - ORDER BY - masklen(feeds.network) DESC - LIMIT 1 - ), - ( - SELECT - is_satellite_provider - FROM - autnum_feeds feeds - WHERE - networks.autnum = feeds.number - ORDER BY - source - LIMIT 1 - ), - ( - SELECT - is_satellite_provider - FROM - autnum_overrides overrides - WHERE - networks.autnum = overrides.number - ), - FALSE - ) AS is_satellite_provider, - COALESCE( - ( - SELECT - is_anycast - FROM - network_overrides overrides - WHERE - networks.network <<= overrides.network - ORDER BY - masklen(overrides.network) DESC - LIMIT 1 - ), + UNION + + SELECT + network + FROM + network_overrides + + UNION + + SELECT + network + FROM + geofeed_networks + """) + + # Create an index to search through networks faster + self.db.execute(""" + CREATE INDEX + n_search + ON + n + USING + SPGIST(network) + """) + + # Analyze n + self.db.execute("ANALYZE n") + + # Apply the AS number to all networks + self.db.execute(""" + -- Join all networks together with their most specific announcements + WITH announcements AS ( + SELECT + n.network, + announcements.autnum, + + -- Sort all merges and number them so + -- that we can later select the best one + ROW_NUMBER() + OVER ( - SELECT - is_anycast - FROM - network_feeds feeds - WHERE - networks.network <<= feeds.network + PARTITION BY + n.network ORDER BY - masklen(feeds.network) DESC - LIMIT 1 - ), + masklen(announcements.network) DESC + ) AS row + FROM + n + JOIN + announcements + ON + announcements.network >>= n.network + ) + + -- Store the result + UPDATE + n + SET + autnum = announcements.autnum + FROM + announcements + WHERE + announcements.network = n.network + AND + announcements.row = 1 + """, + ) + + # Apply country information + self.db.execute(""" + WITH networks AS ( + SELECT + n.network, + networks.country, + + ROW_NUMBER() + OVER ( - SELECT - is_anycast - FROM - autnum_feeds feeds - WHERE - networks.autnum = feeds.number + PARTITION BY + n.network ORDER BY - source - LIMIT 1 - ), - ( - SELECT - is_anycast - FROM - autnum_overrides overrides - WHERE - networks.autnum = overrides.number - ), - FALSE - ) AS is_anycast, - COALESCE( + masklen(networks.network) DESC + ) AS row + FROM + n + JOIN + networks + ON + networks.network >>= n.network + ) + + UPDATE + n + SET + country = networks.country + FROM + networks + WHERE + networks.network = n.network + AND + networks.row = 1 + """, + ) + + # Add all country information from Geofeeds + self.db.execute(""" + WITH geofeeds AS ( + SELECT + DISTINCT ON (geofeed_networks.network) + geofeed_networks.network, + geofeed_networks.country + FROM + geofeeds + JOIN + network_geofeeds networks + ON + geofeeds.url = networks.url + JOIN + geofeed_networks + ON + geofeeds.id = geofeed_networks.geofeed_id + AND + networks.network >>= geofeed_networks.network + ), + + networks AS ( + SELECT + n.network, + geofeeds.country, + + ROW_NUMBER() + OVER ( - SELECT - is_drop - FROM - network_overrides overrides - WHERE - networks.network <<= overrides.network + PARTITION BY + n.network ORDER BY - masklen(overrides.network) DESC - LIMIT 1 - ), + masklen(geofeeds.network) DESC + ) AS row + FROM + n + JOIN + geofeeds + ON + geofeeds.network >>= n.network + ) + + UPDATE + n + SET + country = networks.country + FROM + networks + WHERE + networks.network = n.network + AND + networks.row = 1 + """, + ) + + # Apply country and flags from feeds + self.db.execute(""" + WITH networks AS ( + SELECT + n.network, + network_feeds.country, + + -- Flags + network_feeds.is_anonymous_proxy, + network_feeds.is_satellite_provider, + network_feeds.is_anycast, + network_feeds.is_drop, + + ROW_NUMBER() + OVER ( - SELECT - is_drop - FROM - network_feeds feeds - WHERE - networks.network <<= feeds.network + PARTITION BY + n.network ORDER BY - masklen(feeds.network) DESC - LIMIT 1 - ), + masklen(network_feeds.network) DESC + ) AS row + FROM + n + JOIN + network_feeds + ON + network_feeds.network >>= n.network + ) + + UPDATE + n + SET + country = + COALESCE(networks.country, n.country), + + is_anonymous_proxy = + COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy), + + is_satellite_provider = + COALESCE(networks.is_satellite_provider, n.is_satellite_provider), + + is_anycast = + COALESCE(networks.is_anycast, n.is_anycast), + + is_drop = + COALESCE(networks.is_drop, n.is_drop) + FROM + networks + WHERE + networks.network = n.network + AND + networks.row = 1 + """, + ) + + # Apply country and flags from AS feeds + self.db.execute(""" + WITH networks AS ( + SELECT + n.network, + autnum_feeds.country, + + -- Flags + autnum_feeds.is_anonymous_proxy, + autnum_feeds.is_satellite_provider, + autnum_feeds.is_anycast, + autnum_feeds.is_drop + FROM + n + JOIN + autnum_feeds + ON + autnum_feeds.number = n.autnum + ) + + UPDATE + n + SET + country = + COALESCE(networks.country, n.country), + + is_anonymous_proxy = + COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy), + + is_satellite_provider = + COALESCE(networks.is_satellite_provider, n.is_satellite_provider), + + is_anycast = + COALESCE(networks.is_anycast, n.is_anycast), + + is_drop = + COALESCE(networks.is_drop, n.is_drop) + FROM + networks + WHERE + networks.network = n.network + """) + + # Apply network overrides + self.db.execute(""" + WITH networks AS ( + SELECT + n.network, + network_overrides.country, + + -- Flags + network_overrides.is_anonymous_proxy, + network_overrides.is_satellite_provider, + network_overrides.is_anycast, + network_overrides.is_drop, + + ROW_NUMBER() + OVER ( - SELECT - is_drop - FROM - autnum_feeds feeds - WHERE - networks.autnum = feeds.number + PARTITION BY + n.network ORDER BY - source - LIMIT 1 - ), - ( - SELECT - is_drop - FROM - autnum_overrides overrides - WHERE - networks.autnum = overrides.number - ), - FALSE - ) AS is_drop + masklen(network_overrides.network) DESC + ) AS row + FROM + n + JOIN + network_overrides + ON + network_overrides.network >>= n.network + ) + + UPDATE + n + SET + country = + COALESCE(networks.country, n.country), + + is_anonymous_proxy = + COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy), + + is_satellite_provider = + COALESCE(networks.is_satellite_provider, n.is_satellite_provider), + + is_anycast = + COALESCE(networks.is_anycast, n.is_anycast), + + is_drop = + COALESCE(networks.is_drop, n.is_drop) FROM - ordered_networks networks + networks + WHERE + networks.network = n.network + AND + networks.row = 1 + """) + + # Apply AS overrides + self.db.execute(""" + WITH networks AS ( + SELECT + n.network, + autnum_overrides.country, + + -- Flags + autnum_overrides.is_anonymous_proxy, + autnum_overrides.is_satellite_provider, + autnum_overrides.is_anycast, + autnum_overrides.is_drop + FROM + n + JOIN + autnum_overrides + ON + autnum_overrides.number = n.autnum + ) + + UPDATE + n + SET + country = + COALESCE(networks.country, n.country), + + is_anonymous_proxy = + COALESCE(networks.is_anonymous_proxy, n.is_anonymous_proxy), + + is_satellite_provider = + COALESCE(networks.is_satellite_provider, n.is_satellite_provider), + + is_anycast = + COALESCE(networks.is_anycast, n.is_anycast), + + is_drop = + COALESCE(networks.is_drop, n.is_drop) + FROM + networks + WHERE + networks.network = n.network + """) + + # Here we could remove some networks that we no longer need, but since we + # already have implemented our deduplication/merge algorithm this would not + # be necessary. + + # Export the entire temporary table + rows = self.db.query(""" + SELECT + * + FROM + n + ORDER BY + network """) for row in rows: @@ -687,7 +807,17 @@ class CLI(object): # Add all countries log.info("Writing countries...") - rows = self.db.query("SELECT * FROM countries ORDER BY country_code") + + # Select all countries + rows = self.db.query(""" + SELECT + * + FROM + countries + ORDER BY + country_code + """, + ) for row in rows: c = writer.add_country(row.country_code)