1 #!/usr/bin/python3
2 ###############################################################################
3 # #
4 # libloc - A library to determine the location of someone on the Internet #
5 # #
6 # Copyright (C) 2020-2024 IPFire Development Team <info@ipfire.org> #
7 # #
8 # This library is free software; you can redistribute it and/or #
9 # modify it under the terms of the GNU Lesser General Public #
10 # License as published by the Free Software Foundation; either #
11 # version 2.1 of the License, or (at your option) any later version. #
12 # #
13 # This library is distributed in the hope that it will be useful, #
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
16 # Lesser General Public License for more details. #
17 # #
18 ###############################################################################
19
20 import argparse
21 import concurrent.futures
22 import csv
23 import functools
24 import http.client
25 import ipaddress
26 import json
27 import logging
28 import math
29 import re
30 import socket
31 import sys
32 import urllib.error
import urllib.parse
33
34 # Load our location module
35 import location
36 import location.database
37 import location.importer
38 from location.i18n import _
39
40 # Initialise logging
41 log = logging.getLogger("location.importer")
42 log.propagate = True
43
44 # Define constants
45 VALID_ASN_RANGES = (
46 (1, 23455),
47 (23457, 64495),
48 (131072, 4199999999),
49 )
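# The gaps in these ranges are deliberate: AS 23456 is AS_TRANS (RFC 6793),
# 64496-64511 and 65536-65551 are reserved for documentation (RFC 5398), and
# 64512-65534 as well as 4200000000-4294967294 are designated for private use
# (RFC 6996) - none of them belongs in a public location database.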
50
51 TRANSLATED_COUNTRIES = {
52 # When people say UK, they mean GB
53 "UK" : "GB",
54 }
55
56 IGNORED_COUNTRIES = set((
57 # Formerly Yugoslavia
58 "YU",
59
60 # Some people use ZZ to say "no country" or to hide the country
61 "ZZ",
62 ))
63
64 # Configure the CSV parser for ARIN
65 csv.register_dialect("arin", delimiter=",", quoting=csv.QUOTE_ALL, quotechar="\"")
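# ARIN's asns.csv is expected to contain quoted records roughly like the
# following (illustrative; the column names are the ones read by
# _import_as_names_from_arin() below):
#
#   "Org Name","Start AS Number","End AS Number","Status"
#   "Example Corp","64496","64496","Full Registry Services"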
66
67 class CLI(object):
68 def parse_cli(self):
69 parser = argparse.ArgumentParser(
70 description=_("Location Importer Command Line Interface"),
71 )
72 subparsers = parser.add_subparsers()
73
74 # Global configuration flags
75 parser.add_argument("--debug", action="store_true",
76 help=_("Enable debug output"))
77 parser.add_argument("--quiet", action="store_true",
78 help=_("Enable quiet mode"))
79
80 # version
81 parser.add_argument("--version", action="version",
82 version="%(prog)s @VERSION@")
83
84 # Database
85 parser.add_argument("--database-host", required=True,
86 help=_("Database Hostname"), metavar=_("HOST"))
87 parser.add_argument("--database-name", required=True,
88 help=_("Database Name"), metavar=_("NAME"))
89 parser.add_argument("--database-username", required=True,
90 help=_("Database Username"), metavar=_("USERNAME"))
91 parser.add_argument("--database-password", required=True,
92 help=_("Database Password"), metavar=_("PASSWORD"))
93
94 # Write Database
95 write = subparsers.add_parser("write", help=_("Write database to file"))
96 write.set_defaults(func=self.handle_write)
97 write.add_argument("file", nargs=1, help=_("Database File"))
98 write.add_argument("--signing-key", nargs="?", type=open, help=_("Signing Key"))
99 write.add_argument("--backup-signing-key", nargs="?", type=open, help=_("Backup Signing Key"))
100 write.add_argument("--vendor", nargs="?", help=_("Sets the vendor"))
101 write.add_argument("--description", nargs="?", help=_("Sets a description"))
102 write.add_argument("--license", nargs="?", help=_("Sets the license"))
103 write.add_argument("--version", type=int, help=_("Database Format Version"))
104
105 # Update WHOIS
106 update_whois = subparsers.add_parser("update-whois", help=_("Update WHOIS Information"))
107 update_whois.set_defaults(func=self.handle_update_whois)
108
109 # Update announcements
110 update_announcements = subparsers.add_parser("update-announcements",
111 help=_("Update BGP Announcements"))
112 update_announcements.set_defaults(func=self.handle_update_announcements)
113 update_announcements.add_argument("server", nargs=1,
114 help=_("Route Server to connect to"), metavar=_("SERVER"))
115
116 # Update geofeeds
117 update_geofeeds = subparsers.add_parser("update-geofeeds",
118 help=_("Update Geofeeds"))
119 update_geofeeds.set_defaults(func=self.handle_update_geofeeds)
120
121 # Update feeds
122 update_feeds = subparsers.add_parser("update-feeds",
123 help=_("Update Feeds"))
124 update_feeds.add_argument("feeds", nargs="*",
125 help=_("Only update these feeds"))
126 update_feeds.set_defaults(func=self.handle_update_feeds)
127
128 # Update overrides
129 update_overrides = subparsers.add_parser("update-overrides",
130 help=_("Update overrides"),
131 )
132 update_overrides.add_argument(
133 "files", nargs="+", help=_("Files to import"),
134 )
135 update_overrides.set_defaults(func=self.handle_update_overrides)
136
137 # Import countries
138 import_countries = subparsers.add_parser("import-countries",
139 help=_("Import countries"),
140 )
141 import_countries.add_argument("file", nargs=1, type=argparse.FileType("r"),
142 help=_("File to import"))
143 import_countries.set_defaults(func=self.handle_import_countries)
144
145 args = parser.parse_args()
146
147 # Configure logging
148 if args.debug:
149 location.logger.set_level(logging.DEBUG)
150 elif args.quiet:
151 location.logger.set_level(logging.WARNING)
152
153 # Print usage if no action was given
154 if not "func" in args:
155 parser.print_usage()
156 sys.exit(2)
157
158 return args
159
160 def run(self):
161 # Parse command line arguments
162 args = self.parse_cli()
163
164 # Initialise database
165 self.db = self._setup_database(args)
166
167 # Call function
168 ret = args.func(args)
169
170 # Return with exit code
171 if ret:
172 sys.exit(ret)
173
174 # Otherwise just exit
175 sys.exit(0)
176
177 def _setup_database(self, ns):
178 """
179 Initialise the database
180 """
181 # Connect to database
182 db = location.database.Connection(
183 host=ns.database_host, database=ns.database_name,
184 user=ns.database_username, password=ns.database_password,
185 )
186
187 with db.transaction():
188 db.execute("""
189 -- announcements
190 CREATE TABLE IF NOT EXISTS announcements(network inet, autnum bigint,
191 first_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
192 last_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP);
193 CREATE UNIQUE INDEX IF NOT EXISTS announcements_networks ON announcements(network);
194 CREATE INDEX IF NOT EXISTS announcements_family ON announcements(family(network));
195 CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);
196
197 -- autnums
198 CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
199 ALTER TABLE autnums ADD COLUMN IF NOT EXISTS source text;
200 CREATE UNIQUE INDEX IF NOT EXISTS autnums_number ON autnums(number);
201
202 -- countries
203 CREATE TABLE IF NOT EXISTS countries(
204 country_code text NOT NULL, name text NOT NULL, continent_code text NOT NULL);
205 CREATE UNIQUE INDEX IF NOT EXISTS countries_country_code ON countries(country_code);
206
207 -- networks
208 CREATE TABLE IF NOT EXISTS networks(network inet, country text);
209 ALTER TABLE networks ADD COLUMN IF NOT EXISTS original_countries text[];
210 ALTER TABLE networks ADD COLUMN IF NOT EXISTS source text;
211 CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
212 CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
213 CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);
214
215 -- geofeeds
216 CREATE TABLE IF NOT EXISTS geofeeds(
217 id serial primary key,
218 url text,
219 status integer default null,
220 updated_at timestamp without time zone default null
221 );
222 ALTER TABLE geofeeds ADD COLUMN IF NOT EXISTS error text;
223 CREATE UNIQUE INDEX IF NOT EXISTS geofeeds_unique
224 ON geofeeds(url);
225 CREATE TABLE IF NOT EXISTS geofeed_networks(
226 geofeed_id integer references geofeeds(id) on delete cascade,
227 network inet,
228 country text,
229 region text,
230 city text
231 );
232 CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
233 ON geofeed_networks(geofeed_id);
234 CREATE INDEX IF NOT EXISTS geofeed_networks_search
235 ON geofeed_networks USING GIST(network inet_ops);
236 CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
237 CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
238 ON network_geofeeds(network);
239 CREATE INDEX IF NOT EXISTS network_geofeeds_search
240 ON network_geofeeds USING GIST(network inet_ops);
241 CREATE INDEX IF NOT EXISTS network_geofeeds_url
242 ON network_geofeeds(url);
243
244 -- feeds
245 CREATE TABLE IF NOT EXISTS autnum_feeds(
246 number bigint NOT NULL,
247 source text NOT NULL,
248 name text,
249 country text,
250 is_anonymous_proxy boolean,
251 is_satellite_provider boolean,
252 is_anycast boolean,
253 is_drop boolean
254 );
255 CREATE UNIQUE INDEX IF NOT EXISTS autnum_feeds_unique
256 ON autnum_feeds(number, source);
257
258 CREATE TABLE IF NOT EXISTS network_feeds(
259 network inet NOT NULL,
260 source text NOT NULL,
261 country text,
262 is_anonymous_proxy boolean,
263 is_satellite_provider boolean,
264 is_anycast boolean,
265 is_drop boolean
266 );
267 CREATE UNIQUE INDEX IF NOT EXISTS network_feeds_unique
268 ON network_feeds(network, source);
269 CREATE INDEX IF NOT EXISTS network_feeds_search
270 ON network_feeds USING GIST(network inet_ops);
271
272 -- overrides
273 CREATE TABLE IF NOT EXISTS autnum_overrides(
274 number bigint NOT NULL,
275 name text,
276 country text,
277 is_anonymous_proxy boolean,
278 is_satellite_provider boolean,
279 is_anycast boolean
280 );
281 CREATE UNIQUE INDEX IF NOT EXISTS autnum_overrides_number
282 ON autnum_overrides(number);
283 ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
284 ALTER TABLE autnum_overrides DROP COLUMN IF EXISTS source;
285
286 CREATE TABLE IF NOT EXISTS network_overrides(
287 network inet NOT NULL,
288 country text,
289 is_anonymous_proxy boolean,
290 is_satellite_provider boolean,
291 is_anycast boolean
292 );
293 CREATE UNIQUE INDEX IF NOT EXISTS network_overrides_network
294 ON network_overrides(network);
295 CREATE INDEX IF NOT EXISTS network_overrides_search
296 ON network_overrides USING GIST(network inet_ops);
297 ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
298 ALTER TABLE network_overrides DROP COLUMN IF EXISTS source;
299 """)
300
301 return db
302
303 def fetch_countries(self):
304 """
305 Returns a list of all country codes currently in the database
306 """
307 # Fetch all valid country codes to check parsed networks against...
308 countries = self.db.query("SELECT country_code FROM countries ORDER BY country_code")
309
310 return [country.country_code for country in countries]
311
312 def handle_write(self, ns):
313 """
314 Compiles a database in libloc format out of what is in the database
315 """
316 # Allocate a writer
317 writer = location.Writer(ns.signing_key, ns.backup_signing_key)
318
319 # Set all metadata
320 if ns.vendor:
321 writer.vendor = ns.vendor
322
323 if ns.description:
324 writer.description = ns.description
325
326 if ns.license:
327 writer.license = ns.license
328
329 # Add all Autonomous Systems
330 log.info("Writing Autonomous Systems...")
331
332 # Select all ASes with a name
333 rows = self.db.query("""
334 SELECT
335 autnums.number AS number,
336 COALESCE(
337 overrides.name,
338 autnums.name
339 ) AS name
340 FROM
341 autnums
342 LEFT JOIN
343 autnum_overrides overrides ON autnums.number = overrides.number
344 ORDER BY
345 autnums.number
346 """)
347
348 for row in rows:
349 # Skip AS without names
350 if not row.name:
351 continue
352
353 a = writer.add_as(row.number)
354 a.name = row.name
355
356 # Add all networks
357 log.info("Writing networks...")
358
359 # Select all known networks
360 rows = self.db.query("""
361 WITH known_networks AS (
362 SELECT network FROM announcements
363 UNION
364 SELECT network FROM networks
365 UNION
366 SELECT network FROM network_feeds
367 UNION
368 SELECT network FROM network_overrides
369 UNION
370 SELECT network FROM geofeed_networks
371 ),
372
373 ordered_networks AS (
374 SELECT
375 known_networks.network AS network,
376 announcements.autnum AS autnum,
377 networks.country AS country,
378
379 -- Must be part of returned values for ORDER BY clause
380 masklen(announcements.network) AS sort_a,
381 masklen(networks.network) AS sort_b
382 FROM
383 known_networks
384 LEFT JOIN
385 announcements ON known_networks.network <<= announcements.network
386 LEFT JOIN
387 networks ON known_networks.network <<= networks.network
388 ORDER BY
389 known_networks.network,
390 sort_a DESC,
391 sort_b DESC
392 )
393
394 -- Return a list of those networks enriched with all
395 -- other information that we store in the database
396 SELECT
397 DISTINCT ON (network)
398 network,
399 autnum,
400
401 -- Country
402 COALESCE(
403 (
404 SELECT
405 country
406 FROM
407 network_overrides overrides
408 WHERE
409 networks.network <<= overrides.network
410 ORDER BY
411 masklen(overrides.network) DESC
412 LIMIT 1
413 ),
414 (
415 SELECT
416 country
417 FROM
418 autnum_overrides overrides
419 WHERE
420 networks.autnum = overrides.number
421 ),
422 (
423 SELECT
424 country
425 FROM
426 network_feeds feeds
427 WHERE
428 networks.network <<= feeds.network
429 ORDER BY
430 masklen(feeds.network) DESC
431 LIMIT 1
432 ),
433 (
434 SELECT
435 country
436 FROM
437 autnum_feeds feeds
438 WHERE
439 networks.autnum = feeds.number
440 ORDER BY
441 source
442 LIMIT 1
443 ),
444 (
445 SELECT
446 geofeed_networks.country AS country
447 FROM
448 network_geofeeds
449
450 -- Join the data from the geofeeds
451 LEFT JOIN
452 geofeeds ON network_geofeeds.url = geofeeds.url
453 LEFT JOIN
454 geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id
455
456 -- Check whether we have a geofeed for this network
457 WHERE
458 networks.network <<= network_geofeeds.network
459 AND
460 networks.network <<= geofeed_networks.network
461
462 -- Filter for the best result
463 ORDER BY
464 masklen(geofeed_networks.network) DESC
465 LIMIT 1
466 ),
467 networks.country
468 ) AS country,
469
470 -- Flags
471 COALESCE(
472 (
473 SELECT
474 is_anonymous_proxy
475 FROM
476 network_overrides overrides
477 WHERE
478 networks.network <<= overrides.network
479 ORDER BY
480 masklen(overrides.network) DESC
481 LIMIT 1
482 ),
483 (
484 SELECT
485 is_anonymous_proxy
486 FROM
487 network_feeds feeds
488 WHERE
489 networks.network <<= feeds.network
490 ORDER BY
491 masklen(feeds.network) DESC
492 LIMIT 1
493 ),
494 (
495 SELECT
496 is_anonymous_proxy
497 FROM
498 autnum_feeds feeds
499 WHERE
500 networks.autnum = feeds.number
501 ORDER BY
502 source
503 LIMIT 1
504 ),
505 (
506 SELECT
507 is_anonymous_proxy
508 FROM
509 autnum_overrides overrides
510 WHERE
511 networks.autnum = overrides.number
512 ),
513 FALSE
514 ) AS is_anonymous_proxy,
515 COALESCE(
516 (
517 SELECT
518 is_satellite_provider
519 FROM
520 network_overrides overrides
521 WHERE
522 networks.network <<= overrides.network
523 ORDER BY
524 masklen(overrides.network) DESC
525 LIMIT 1
526 ),
527 (
528 SELECT
529 is_satellite_provider
530 FROM
531 network_feeds feeds
532 WHERE
533 networks.network <<= feeds.network
534 ORDER BY
535 masklen(feeds.network) DESC
536 LIMIT 1
537 ),
538 (
539 SELECT
540 is_satellite_provider
541 FROM
542 autnum_feeds feeds
543 WHERE
544 networks.autnum = feeds.number
545 ORDER BY
546 source
547 LIMIT 1
548 ),
549 (
550 SELECT
551 is_satellite_provider
552 FROM
553 autnum_overrides overrides
554 WHERE
555 networks.autnum = overrides.number
556 ),
557 FALSE
558 ) AS is_satellite_provider,
559 COALESCE(
560 (
561 SELECT
562 is_anycast
563 FROM
564 network_overrides overrides
565 WHERE
566 networks.network <<= overrides.network
567 ORDER BY
568 masklen(overrides.network) DESC
569 LIMIT 1
570 ),
571 (
572 SELECT
573 is_anycast
574 FROM
575 network_feeds feeds
576 WHERE
577 networks.network <<= feeds.network
578 ORDER BY
579 masklen(feeds.network) DESC
580 LIMIT 1
581 ),
582 (
583 SELECT
584 is_anycast
585 FROM
586 autnum_feeds feeds
587 WHERE
588 networks.autnum = feeds.number
589 ORDER BY
590 source
591 LIMIT 1
592 ),
593 (
594 SELECT
595 is_anycast
596 FROM
597 autnum_overrides overrides
598 WHERE
599 networks.autnum = overrides.number
600 ),
601 FALSE
602 ) AS is_anycast,
603 COALESCE(
604 (
605 SELECT
606 is_drop
607 FROM
608 network_overrides overrides
609 WHERE
610 networks.network <<= overrides.network
611 ORDER BY
612 masklen(overrides.network) DESC
613 LIMIT 1
614 ),
615 (
616 SELECT
617 is_drop
618 FROM
619 network_feeds feeds
620 WHERE
621 networks.network <<= feeds.network
622 ORDER BY
623 masklen(feeds.network) DESC
624 LIMIT 1
625 ),
626 (
627 SELECT
628 is_drop
629 FROM
630 autnum_feeds feeds
631 WHERE
632 networks.autnum = feeds.number
633 ORDER BY
634 source
635 LIMIT 1
636 ),
637 (
638 SELECT
639 is_drop
640 FROM
641 autnum_overrides overrides
642 WHERE
643 networks.autnum = overrides.number
644 ),
645 FALSE
646 ) AS is_drop
647 FROM
648 ordered_networks networks
649 """)
650
651 for row in rows:
652 network = writer.add_network(row.network)
653
654 # Save country
655 if row.country:
656 network.country_code = row.country
657
658 # Save ASN
659 if row.autnum:
660 network.asn = row.autnum
661
662 # Set flags
663 if row.is_anonymous_proxy:
664 network.set_flag(location.NETWORK_FLAG_ANONYMOUS_PROXY)
665
666 if row.is_satellite_provider:
667 network.set_flag(location.NETWORK_FLAG_SATELLITE_PROVIDER)
668
669 if row.is_anycast:
670 network.set_flag(location.NETWORK_FLAG_ANYCAST)
671
672 if row.is_drop:
673 network.set_flag(location.NETWORK_FLAG_DROP)
674
675 # Add all countries
676 log.info("Writing countries...")
677 rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
678
679 for row in rows:
680 c = writer.add_country(row.country_code)
681 c.continent_code = row.continent_code
682 c.name = row.name
683
684 # Write everything to file
685 log.info("Writing database to file...")
686 for file in ns.file:
687 writer.write(file)
688
689 def handle_update_whois(self, ns):
690 downloader = location.importer.Downloader()
691
692 # Did we run successfully?
693 error = False
694
695 # Fetch all valid country codes to check parsed networks against
696 countries = self.fetch_countries()
697
698 # Check if we have countries
699 if not countries:
700 log.error("Please import countries before importing any WHOIS data")
701 return 1
702
703 # Iterate over all potential sources
704 for source in sorted(location.importer.SOURCES):
705 with self.db.transaction():
706 # Create some temporary tables to store parsed data
707 self.db.execute("""
708 CREATE TEMPORARY TABLE _autnums(number integer NOT NULL,
709 organization text NOT NULL, source text NOT NULL) ON COMMIT DROP;
710 CREATE UNIQUE INDEX _autnums_number ON _autnums(number);
711
712 CREATE TEMPORARY TABLE _organizations(handle text NOT NULL,
713 name text NOT NULL, source text NOT NULL) ON COMMIT DROP;
714 CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);
715
716 CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL,
717 original_countries text[] NOT NULL, source text NOT NULL)
718 ON COMMIT DROP;
719 CREATE INDEX _rirdata_search ON _rirdata
720 USING BTREE(family(network), masklen(network));
721 CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
722 """)
723
724 # Remove all previously imported content
725 self.db.execute("DELETE FROM autnums WHERE source = %s", source)
726 self.db.execute("DELETE FROM networks WHERE source = %s", source)
727
728 try:
729 # Fetch WHOIS sources
730 for url in location.importer.WHOIS_SOURCES.get(source, []):
731 for block in downloader.request_blocks(url):
732 self._parse_block(block, source, countries)
733
734 # Fetch extended sources
735 for url in location.importer.EXTENDED_SOURCES.get(source, []):
736 for line in downloader.request_lines(url):
737 self._parse_line(line, source, countries)
738 except urllib.error.URLError as e:
739 log.error("Could not retrieve data from %s: %s" % (source, e))
740 error = True
741
742 # Continue with the next source
743 continue
744
745 # Process all parsed networks from every RIR we happen to have access to,
746 # insert the largest network chunks into the networks table immediately...
747 families = self.db.query("""
748 SELECT DISTINCT
749 family(network) AS family
750 FROM
751 _rirdata
752 ORDER BY
753 family(network)
754 """,
755 )
756
757 for family in (row.family for row in families):
758 # Fetch the smallest mask length in our data set
759 smallest = self.db.get("""
760 SELECT
761 MIN(
762 masklen(network)
763 ) AS prefix
764 FROM
765 _rirdata
766 WHERE
767 family(network) = %s
768 """, family,
769 )
770
771 # Copy all networks
772 self.db.execute("""
773 INSERT INTO
774 networks
775 (
776 network,
777 country,
778 original_countries,
779 source
780 )
781 SELECT
782 network,
783 country,
784 original_countries,
785 source
786 FROM
787 _rirdata
788 WHERE
789 masklen(network) = %s
790 AND
791 family(network) = %s
792 ON CONFLICT DO
793 NOTHING""",
794 smallest.prefix,
795 family,
796 )
797
798 # ... determine any other prefixes for this network family, ...
799 prefixes = self.db.query("""
800 SELECT
801 DISTINCT masklen(network) AS prefix
802 FROM
803 _rirdata
804 WHERE
805 family(network) = %s
806 ORDER BY
807 masklen(network) ASC
808 OFFSET 1
809 """, family,
810 )
811
812 # ... and insert networks with this prefix in case they provide additional
813 # information (i.e. a subnet of a larger chunk with a different country)
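# For example (addresses illustrative): if 81.3.0.0/16 (DE) was inserted in
# the first pass and _rirdata also contains 81.3.16.0/20 (AT) and
# 81.3.32.0/20 (DE), only the /20 with the deviating country (AT) is
# inserted here - the other one would not tell us anything new.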
814 for prefix in (row.prefix for row in prefixes):
815 self.db.execute("""
816 WITH candidates AS (
817 SELECT
818 _rirdata.network,
819 _rirdata.country,
820 _rirdata.original_countries,
821 _rirdata.source
822 FROM
823 _rirdata
824 WHERE
825 family(_rirdata.network) = %s
826 AND
827 masklen(_rirdata.network) = %s
828 ),
829 filtered AS (
830 SELECT
831 DISTINCT ON (c.network)
832 c.network,
833 c.country,
834 c.original_countries,
835 c.source,
836 masklen(networks.network),
837 networks.country AS parent_country
838 FROM
839 candidates c
840 LEFT JOIN
841 networks
842 ON
843 c.network << networks.network
844 ORDER BY
845 c.network,
846 masklen(networks.network) DESC NULLS LAST
847 )
848 INSERT INTO
849 networks(network, country, original_countries, source)
850 SELECT
851 network,
852 country,
853 original_countries,
854 source
855 FROM
856 filtered
857 WHERE
858 parent_country IS NULL
859 OR
860 country <> parent_country
861 ON CONFLICT DO NOTHING
862 """, family, prefix,
863 )
864
865 self.db.execute("""
866 INSERT INTO
867 autnums
868 (
869 number,
870 name,
871 source
872 )
873 SELECT
874 _autnums.number,
875 _organizations.name,
876 _organizations.source
877 FROM
878 _autnums
879 JOIN
880 _organizations ON _autnums.organization = _organizations.handle
881 ON CONFLICT
882 (
883 number
884 )
885 DO UPDATE
886 SET name = excluded.name
887 """,
888 )
889
890 # Download and import (technical) AS names from ARIN
891 with self.db.transaction():
892 self._import_as_names_from_arin(downloader)
893
894 # Return a non-zero exit code for errors
895 return 1 if error else 0
896
897 def _check_parsed_network(self, network):
898 """
899 Assistive function to detect and subsequently sort out parsed
900 networks from RIR data (both Whois and so-called "extended sources"),
901 which...
902 
903 (a) are not globally routable (RFC 1918 space, et al.),
904 (b) cover too large a chunk of the IP address space (prefix length
905 less than 7 for IPv4 networks, or less than 10 for IPv6),
906 (c) have "0.0.0.0" or "::" as their network address, or
907 (d) are too small to be publicly announced (we have decided not to
908 process them at the moment, as they would significantly enlarge our
909 database without providing very helpful additional information)
910
911 This unfortunately is necessary due to brain-dead clutter across
912 various RIR databases, causing mismatches and eventually disruptions.
913
914 We will return False in case a network is not suitable for adding
915 it to our database, and True otherwise.
916 """
917
918 if not network or not (isinstance(network, ipaddress.IPv4Network) or isinstance(network, ipaddress.IPv6Network)):
919 return False
920
921 if not network.is_global:
922 log.debug("Skipping non-globally routable network: %s" % network)
923 return False
924
925 if network.version == 4:
926 if network.prefixlen < 7:
927 log.debug("Skipping too big IP chunk: %s" % network)
928 return False
929
930 if network.prefixlen > 24:
931 log.debug("Skipping network too small to be publicly announced: %s" % network)
932 return False
933
934 if str(network.network_address) == "0.0.0.0":
935 log.debug("Skipping network based on 0.0.0.0: %s" % network)
936 return False
937
938 elif network.version == 6:
939 if network.prefixlen < 10:
940 log.debug("Skipping too big IP chunk: %s" % network)
941 return False
942
943 if network.prefixlen > 48:
944 log.debug("Skipping network too small to be publicly announced: %s" % network)
945 return False
946
947 if str(network.network_address) == "::":
948 log.debug("Skipping network based on '::': %s" % network)
949 return False
950
951 else:
952 # This should not happen...
953 log.warning("Skipping network of unknown family, this should not happen: %s" % network)
954 return False
955
956 # In case we have made it here, the network is considered to
957 # be suitable for libloc consumption...
958 return True
959
960 def _check_parsed_asn(self, asn):
961 """
962 Assistive function to filter Autonomous System Numbers not being suitable
963 for adding to our database. Returns False in such cases, and True otherwise.
964 """
965
966 for start, end in VALID_ASN_RANGES:
967 if start <= asn <= end:
968 return True
969
970 log.info("Supplied ASN %s is out of the publicly routable ASN ranges" % asn)
971 return False
972
973 def _parse_block(self, block, source_key, countries):
974 # Get first line to find out what type of block this is
975 line = block[0]
976
977 # aut-num
978 if line.startswith("aut-num:"):
979 return self._parse_autnum_block(block, source_key)
980
981 # inetnum
982 if line.startswith("inet6num:") or line.startswith("inetnum:"):
983 return self._parse_inetnum_block(block, source_key, countries)
984
985 # organisation
986 elif line.startswith("organisation:"):
987 return self._parse_org_block(block, source_key)
988
989 def _parse_autnum_block(self, block, source_key):
990 autnum = {}
991 for line in block:
992 # Split line
993 key, val = split_line(line)
994
995 if key == "aut-num":
996 m = re.match(r"^(AS|as)(\d+)", val)
997 if m:
998 autnum["asn"] = m.group(2)
999
1000 elif key == "org":
1001 autnum[key] = val.upper()
1002
1003 elif key == "descr":
1004 # Save the first description line as well...
1005 if not key in autnum:
1006 autnum[key] = val
1007
1008 # Skip empty objects
1009 if not autnum or not "asn" in autnum:
1010 return
1011
1012 # Insert a dummy organisation handle into our temporary organisations
1013 # table in case the AS does not have an organisation handle set, but
1014 # has a description (a quirk often observed in APNIC area), so we can
1015 # later display at least some string for this AS.
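# An AS 64496 with only a description would thus end up with the synthetic
# handle "LIBLOC-64496-ORGHANDLE" (illustrative).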
1016 if not "org" in autnum:
1017 if "descr" in autnum:
1018 autnum["org"] = "LIBLOC-%s-ORGHANDLE" % autnum.get("asn")
1019
1020 self.db.execute("INSERT INTO _organizations(handle, name, source) \
1021 VALUES(%s, %s, %s) ON CONFLICT (handle) DO NOTHING",
1022 autnum.get("org"), autnum.get("descr"), source_key,
1023 )
1024 else:
1025 log.warning("ASN %s neither has an organisation handle nor a description line set, omitting" % \
1026 autnum.get("asn"))
1027 return
1028
1029 # Insert into database
1030 self.db.execute("INSERT INTO _autnums(number, organization, source) \
1031 VALUES(%s, %s, %s) ON CONFLICT (number) DO UPDATE SET \
1032 organization = excluded.organization",
1033 autnum.get("asn"), autnum.get("org"), source_key,
1034 )
1035
1036 def _parse_inetnum_block(self, block, source_key, countries):
1037 log.debug("Parsing inetnum block:")
1038
1039 inetnum = {}
1040 for line in block:
1041 log.debug(line)
1042
1043 # Split line
1044 key, val = split_line(line)
1045
1046 # Filter any inetnum records which are only referring to IP space
1047 # not managed by that specific RIR...
1048 if key == "netname":
1049 if re.match(r"^(ERX-NETBLOCK|(AFRINIC|ARIN|LACNIC|RIPE)-CIDR-BLOCK|IANA-NETBLOCK-\d{1,3}|NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK|STUB-[\d-]{3,}SLASH\d{1,2})", val.strip()):
1050 log.debug("Skipping record indicating historic/orphaned data: %s" % val.strip())
1051 return
1052
1053 if key == "inetnum":
1054 start_address, delim, end_address = val.partition("-")
1055
1056 # Strip any excess space
1057 start_address, end_address = start_address.rstrip(), end_address.strip()
1058
1059 # Handle "inetnum" formatting in LACNIC DB (e.g. "24.152.8/22" instead of "24.152.8.0/22")
1060 if start_address and not (delim or end_address):
1061 try:
1062 start_address = ipaddress.ip_network(start_address, strict=False)
1063 except ValueError:
1064 start_address = start_address.split("/")
1065 ldigits = start_address[0].count(".")
1066
1067 # How many octets do we need to add?
1068 # (LACNIC does not seem to have a /8 or greater assigned, so the following should suffice.)
1069 if ldigits == 1:
1070 start_address = start_address[0] + ".0.0/" + start_address[1]
1071 elif ldigits == 2:
1072 start_address = start_address[0] + ".0/" + start_address[1]
1073 else:
1074 log.warning("Could not recover IPv4 address from line in LACNIC DB format: %s" % line)
1075 return
1076
1077 try:
1078 start_address = ipaddress.ip_network(start_address, strict=False)
1079 except ValueError:
1080 log.warning("Could not parse line in LACNIC DB format: %s" % line)
1081 return
1082
1083 # Enumerate first and last IP address of this network
1084 end_address = start_address[-1]
1085 start_address = start_address[0]
1086
1087 else:
1088 # Convert to IP address
1089 try:
1090 start_address = ipaddress.ip_address(start_address)
1091 end_address = ipaddress.ip_address(end_address)
1092 except ValueError:
1093 log.warning("Could not parse line: %s" % line)
1094 return
1095
1096 inetnum["inetnum"] = list(ipaddress.summarize_address_range(start_address, end_address))
1097
1098 elif key == "inet6num":
1099 inetnum[key] = [ipaddress.ip_network(val, strict=False)]
1100
1101 elif key == "country":
1102 val = val.upper()
1103
1104 # Catch RIR data objects with more than one country code...
1105 if not key in inetnum:
1106 inetnum[key] = []
1107 else:
1108 if val in inetnum.get("country"):
1109 # ... but keep this list distinct...
1110 continue
1111
1112 # Ignore certain country codes
1113 if val in IGNORED_COUNTRIES:
1114 log.debug("Ignoring country code '%s'" % val)
1115 continue
1116
1117 # Translate country codes
1118 try:
1119 val = TRANSLATED_COUNTRIES[val]
1120 except KeyError:
1121 pass
1122
1123 inetnum[key].append(val)
1124
1125 # Parse the geofeed attribute
1126 elif key == "geofeed":
1127 inetnum["geofeed"] = val
1128
1129 # Parse geofeed when used as a remark
1130 elif key == "remarks":
1131 m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
1132 if m:
1133 inetnum["geofeed"] = m.group(1)
1134
1135 # Skip empty objects
1136 if not inetnum or not "country" in inetnum:
1137 return
1138
1139 # Prepare skipping objects with unknown country codes...
1140 invalidcountries = [singlecountry for singlecountry in inetnum.get("country") if singlecountry not in countries]
1141
1142 # Iterate through all networks enumerated from above, check them for plausibility and insert
1143 # them into the database, if _check_parsed_network() succeeded
1144 for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
1145 if self._check_parsed_network(single_network):
1146 # Skip objects with unknown country codes, but only log this for otherwise valid networks to avoid log spam...
1147 if invalidcountries:
1148 log.warning("Skipping network with bogus country code(s) %s (original countries: %s): %s" % \
1149 (invalidcountries, inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
1150 break
1151
1152 # Everything is fine here, run INSERT statement...
1153 self.db.execute("INSERT INTO _rirdata(network, country, original_countries, source) \
1154 VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
1155 "%s" % single_network, inetnum.get("country")[0], inetnum.get("country"), source_key,
1156 )
1157
1158 # Update any geofeed information
1159 geofeed = inetnum.get("geofeed", None)
1160 if geofeed:
1161 self._parse_geofeed(geofeed, single_network)
1162
1163 # Delete any previous geofeeds
1164 else:
1165 self.db.execute("DELETE FROM network_geofeeds WHERE network = %s",
1166 "%s" % single_network)
1167
1168 def _parse_geofeed(self, url, single_network):
1169 # Parse the URL
1170 url = urllib.parse.urlparse(url)
1171
1172 # Make sure that this is a HTTPS URL
1173 if not url.scheme == "https":
1174 log.debug("Geofeed URL is not using HTTPS: %s" % url.geturl())
1175 return
1176
1177 # Put the URL back together normalized
1178 url = url.geturl()
1179
1180 # Store/update any geofeeds
1181 self.db.execute("""
1182 INSERT INTO
1183 network_geofeeds(
1184 network,
1185 url
1186 )
1187 VALUES(
1188 %s, %s
1189 )
1190 ON CONFLICT (network) DO
1191 UPDATE SET url = excluded.url""",
1192 "%s" % single_network, url,
1193 )
1194
1195 def _parse_org_block(self, block, source_key):
1196 org = {}
1197 for line in block:
1198 # Split line
1199 key, val = split_line(line)
1200
1201 if key == "organisation":
1202 org[key] = val.upper()
1203 elif key == "org-name":
1204 org[key] = val
1205
1206 # Skip empty objects
1207 if not org:
1208 return
1209
1210 self.db.execute("INSERT INTO _organizations(handle, name, source) \
1211 VALUES(%s, %s, %s) ON CONFLICT (handle) DO \
1212 UPDATE SET name = excluded.name",
1213 org.get("organisation"), org.get("org-name"), source_key,
1214 )
1215
1216 def _parse_line(self, line, source_key, validcountries=None):
1217 # Skip version line
1218 if line.startswith("2"):
1219 return
1220
1221 # Skip comments
1222 if line.startswith("#"):
1223 return
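# Extended delegation statistics are pipe-separated, for example
# (illustrative record):
#
#   ripencc|DE|ipv4|81.3.0.0|16384|19990923|allocated
#
# For IPv4, the fifth field counts addresses, so the prefix length computed
# below is 32 - log2(16384) = 18; for IPv6, it already is the prefix length.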
1224
1225 try:
1226 registry, country_code, type, line = line.split("|", 3)
1227 except ValueError:
1228 log.warning("Could not parse line: %s" % line)
1229 return
1230
1231 # Skip any unknown protocols
1232 if not type in ("ipv6", "ipv4"):
1233 log.warning("Unknown IP protocol '%s'" % type)
1234 return
1235
1236 # Skip any lines that are for stats only or do not have a country
1237 # code at all (avoids log spam below)
1238 if not country_code or country_code == '*':
1239 return
1240
1241 # Skip objects with unknown country codes
1242 if validcountries and country_code not in validcountries:
1243 log.warning("Skipping line with bogus country '%s': %s" % \
1244 (country_code, line))
1245 return
1246
1247 try:
1248 address, prefix, date, status, organization = line.split("|")
1249 except ValueError:
1250 organization = None
1251
1252 # Try parsing the line without organization
1253 try:
1254 address, prefix, date, status = line.split("|")
1255 except ValueError:
1256 log.warning("Unhandled line format: %s" % line)
1257 return
1258
1259 # Skip anything that isn't properly assigned
1260 if not status in ("assigned", "allocated"):
1261 return
1262
1263 # Cast prefix into an integer
1264 try:
1265 prefix = int(prefix)
1266 except ValueError:
1267 log.warning("Invalid prefix: %s" % prefix)
1268 return
1269
1270 # Fix prefix length for IPv4
1271 if type == "ipv4":
1272 prefix = 32 - int(math.log(prefix, 2))
1273
1274 # Try to parse the address
1275 try:
1276 network = ipaddress.ip_network("%s/%s" % (address, prefix), strict=False)
1277 except ValueError:
1278 log.warning("Invalid IP address: %s" % address)
1279 return
1280
1281 if not self._check_parsed_network(network):
1282 return
1283
1284 self.db.execute("""
1285 INSERT INTO
1286 networks
1287 (
1288 network,
1289 country,
1290 original_countries,
1291 source
1292 )
1293 VALUES
1294 (
1295 %s, %s, %s, %s
1296 )
1297 ON CONFLICT (network)
1298 DO UPDATE SET country = excluded.country
1299 """, "%s" % network, country_code, [country], source_key,
1300 )
1301
1302 def _import_as_names_from_arin(self, downloader):
1303 # Delete all previously imported content
1304 self.db.execute("DELETE FROM autnums WHERE source = %s", "ARIN")
1305
1306 # Try to retrieve the feed from ftp.arin.net
1307 feed = downloader.request_lines("https://ftp.arin.net/pub/resource_registry_service/asns.csv")
1308
1309 # Walk through the file
1310 for line in csv.DictReader(feed, dialect="arin"):
1311 log.debug("Processing object: %s" % line)
1312
1313 # Fetch status
1314 status = line.get("Status")
1315
1316 # We are only interested in anything managed by ARIN
1317 if not status == "Full Registry Services":
1318 continue
1319
1320 # Fetch organization name
1321 name = line.get("Org Name")
1322
1323 # Extract ASNs
1324 first_asn = line.get("Start AS Number")
1325 last_asn = line.get("End AS Number")
1326
1327 # Cast to a number
1328 try:
1329 first_asn = int(first_asn)
1330 except (TypeError, ValueError) as e:
1331 log.warning("Could not parse ASN '%s'" % first_asn)
1332 continue
1333
1334 try:
1335 last_asn = int(last_asn)
1336 except (TypeError, ValueError) as e:
1337 log.warning("Could not parse ASN '%s'" % last_asn)
1338 continue
1339
1340 # Check if the range is valid
1341 if last_asn < first_asn:
1342 log.warning("Invalid ASN range %s-%s" % (first_asn, last_asn))
1343
1344 # Insert everything into the database
1345 for asn in range(first_asn, last_asn + 1):
1346 if not self._check_parsed_asn(asn):
1347 log.warning("Skipping invalid ASN %s" % asn)
1348 continue
1349
1350 self.db.execute("""
1351 INSERT INTO
1352 autnums
1353 (
1354 number,
1355 name,
1356 source
1357 )
1358 VALUES
1359 (
1360 %s, %s, %s
1361 )
1362 ON CONFLICT
1363 (
1364 number
1365 )
1366 DO NOTHING
1367 """, asn, name, "ARIN",
1368 )
1369
1370 def handle_update_announcements(self, ns):
1371 server = ns.server[0]
1372
1373 with self.db.transaction():
1374 if server.startswith("/"):
1375 self._handle_update_announcements_from_bird(server)
1376
1377 # Purge anything we never want here
1378 self.db.execute("""
1379 -- Delete default routes
1380 DELETE FROM announcements WHERE network = '::/0' OR network = '0.0.0.0/0';
1381
1382 -- Delete anything that is not global unicast address space
1383 DELETE FROM announcements WHERE family(network) = 6 AND NOT network <<= '2000::/3';
1384
1385 -- DELETE "current network" address space
1386 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '0.0.0.0/8';
1387
1388 -- DELETE local loopback address space
1389 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '127.0.0.0/8';
1390
1391 -- DELETE RFC 1918 address space
1392 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '10.0.0.0/8';
1393 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '172.16.0.0/12';
1394 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.168.0.0/16';
1395
1396 -- DELETE test, benchmark and documentation address space
1397 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.0.0/24';
1398 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.2.0/24';
1399 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.18.0.0/15';
1400 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.51.100.0/24';
1401 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '203.0.113.0/24';
1402
1403 -- DELETE CGNAT address space (RFC 6598)
1404 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '100.64.0.0/10';
1405
1406 -- DELETE link local address space
1407 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '169.254.0.0/16';
1408
1409 -- DELETE IPv6 to IPv4 (6to4) address space (RFC 3068)
1410 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.88.99.0/24';
1411 DELETE FROM announcements WHERE family(network) = 6 AND network <<= '2002::/16';
1412
1413 -- DELETE multicast and reserved address space
1414 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '224.0.0.0/4';
1415 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '240.0.0.0/4';
1416
1417 -- Delete networks that are too small to be in the global routing table
1418 DELETE FROM announcements WHERE family(network) = 6 AND masklen(network) > 48;
1419 DELETE FROM announcements WHERE family(network) = 4 AND masklen(network) > 24;
1420
1421 -- Delete any non-public or reserved ASNs
1422 DELETE FROM announcements WHERE NOT (
1423 (autnum >= 1 AND autnum <= 23455)
1424 OR
1425 (autnum >= 23457 AND autnum <= 64495)
1426 OR
1427 (autnum >= 131072 AND autnum <= 4199999999)
1428 );
1429
1430 -- Delete everything that we have not seen for 14 days
1431 DELETE FROM announcements WHERE last_seen_at <= CURRENT_TIMESTAMP - INTERVAL '14 days';
1432 """)
1433
1434 def _handle_update_announcements_from_bird(self, server):
1435 # Pre-compile the regular expression for faster searching
1436 route = re.compile(rb"^\s(.+?)\s+.+?\[(?:AS(.*?))?.\]$")
1437
1438 log.info("Requesting routing table from Bird (%s)" % server)
1439
1440 aggregated_networks = []
1441
1442 # Send command to list all routes
1443 for line in self._bird_cmd(server, "show route"):
1444 m = route.match(line)
1445 if not m:
1446 # Skip empty lines
1447 if not line:
1448 pass
1449
1450 # Ignore any header lines with the name of the routing table
1451 elif line.startswith(b"Table"):
1452 pass
1453
1454 # Log anything else
1455 else:
1456 log.debug("Could not parse line: %s" % line.decode())
1457
1458 continue
1459
1460 # Fetch the extracted network and ASN
1461 network, autnum = m.groups()
1462
1463 # Decode into strings
1464 if network:
1465 network = network.decode()
1466 if autnum:
1467 autnum = autnum.decode()
1468
1469 # Collect all aggregated networks
1470 if not autnum:
1471 log.debug("%s is an aggregated network" % network)
1472 aggregated_networks.append(network)
1473 continue
1474
1475 # Insert it into the database
1476 self.db.execute("INSERT INTO announcements(network, autnum) \
1477 VALUES(%s, %s) ON CONFLICT (network) DO \
1478 UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
1479 network, autnum,
1480 )
1481
1482 # Process any aggregated networks
1483 for network in aggregated_networks:
1484 log.debug("Processing aggregated network %s" % network)
1485
1486 # Run "show route all" for each network
1487 for line in self._bird_cmd(server, "show route %s all" % network):
1488 # Try finding the path
1489 m = re.match(rb"\s+BGP\.as_path:.* (\d+) {\d+}$", line)
1490 if m:
1491 # Select the last AS number in the path
1492 autnum = m.group(1).decode()
1493
1494 # Insert it into the database
1495 self.db.execute("INSERT INTO announcements(network, autnum) \
1496 VALUES(%s, %s) ON CONFLICT (network) DO \
1497 UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
1498 network, autnum,
1499 )
1500
1501 # We don't need to process any more
1502 break
1503
1504 def _bird_cmd(self, socket_path, command):
1505 # Connect to the socket
1506 s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
1507 s.connect(socket_path)
1508
1509 # Allocate some buffer
1510 buffer = b""
1511
1512 log.debug("Sending Bird command: %s" % command)
1513
1514 # Send the command
1515 s.send(b"%s\n" % command.encode())
1516
1517 while True:
1518 # Fill up the buffer
1519 buffer += s.recv(4096)
1520
1521 while True:
1522 # Search for the next newline
1523 pos = buffer.find(b"\n")
1524
1525 # If we cannot find one, we go back and read more data
1526 if pos <= 0:
1527 break
1528
1529 # Cut after the newline character
1530 pos += 1
1531
1532 # Split the line we want and keep the rest in buffer
1533 line, buffer = buffer[:pos], buffer[pos:]
1534
1535 # Try parsing any status lines
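# BIRD replies carry a four-digit code followed by a space (final line)
# or a hyphen (continuation line), roughly like this (illustrative
# transcript, details vary between BIRD versions):
#
#   0001 BIRD 2.0.7 ready.
#   1007-198.51.100.0/24 unicast [...]
#   0000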
1536 if len(line) > 4 and line[:4].isdigit() and line[4] in (32, 45):
1537 code, delim, line = int(line[:4]), line[4], line[5:]
1538
1539 log.debug("Received response code %s from bird" % code)
1540
1541 # End of output
1542 if code == 0:
1543 return
1544
1545 # Ignore hello line
1546 elif code == 1:
1547 continue
1548
1549 # Otherwise return the line
1550 yield line
1551
1552 def handle_update_geofeeds(self, ns):
1553 # Sync geofeeds
1554 with self.db.transaction():
1555 # Delete all geofeeds which are no longer linked
1556 self.db.execute("""
1557 DELETE FROM
1558 geofeeds
1559 WHERE
1560 NOT EXISTS (
1561 SELECT
1562 1
1563 FROM
1564 network_geofeeds
1565 WHERE
1566 geofeeds.url = network_geofeeds.url
1567 )""",
1568 )
1569
1570 # Copy all geofeeds
1571 self.db.execute("""
1572 INSERT INTO
1573 geofeeds(
1574 url
1575 )
1576 SELECT
1577 url
1578 FROM
1579 network_geofeeds
1580 ON CONFLICT (url)
1581 DO NOTHING
1582 """,
1583 )
1584
1585 # Fetch all Geofeeds that require an update
1586 geofeeds = self.db.query("""
1587 SELECT
1588 id,
1589 url
1590 FROM
1591 geofeeds
1592 WHERE
1593 updated_at IS NULL
1594 OR
1595 updated_at <= CURRENT_TIMESTAMP - INTERVAL '1 week'
1596 ORDER BY
1597 id
1598 """)
1599
1600 # Create a downloader
1601 downloader = location.importer.Downloader()
1602
1603 # Pass the downloader to the fetch_geofeed function
1604 fetch_geofeed = functools.partial(self._fetch_geofeed, downloader)
1605
1606 with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
1607 results = executor.map(fetch_geofeed, geofeeds)
1608
1609 # Fetch all results to raise any exceptions
1610 for result in results:
1611 pass
1612
1613 # Delete data from any feeds that did not update in the last two weeks
1614 with self.db.transaction():
1615 self.db.execute("""
1616 DELETE FROM
1617 geofeed_networks
1618 WHERE
1619 geofeed_networks.geofeed_id IN (
1620 SELECT
1621 geofeeds.id
1622 FROM
1623 geofeeds
1624 WHERE
1625 updated_at IS NULL
1626 OR
1627 updated_at <= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
1628 )
1629 """)
1630
1631 def _fetch_geofeed(self, downloader, geofeed):
1632 log.debug("Fetching Geofeed %s" % geofeed.url)
1633
1634 with self.db.transaction():
1635 # Open the URL
1636 try:
1637 # Send the request
1638 f = downloader.retrieve(geofeed.url, headers={
1639 "User-Agent" : "location/%s" % location.__version__,
1640
1641 # We expect some plain text file in CSV format
1642 "Accept" : "text/csv, text/plain",
1643 })
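# Geofeeds are plain CSV files as specified in RFC 8805, one network per
# line (illustrative example):
#
#   192.0.2.0/24,US,US-CA,Los Angeles,
#
# Only the first four fields (network, country, region, city) are evaluated
# below; the optional postal code field is ignored.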
1644
1645 # Remove any previous data
1646 self.db.execute("DELETE FROM geofeed_networks \
1647 WHERE geofeed_id = %s", geofeed.id)
1648
1649 lineno = 0
1650
1651 # Read the output line by line
1652 for line in f:
1653 lineno += 1
1654
1655 try:
1656 line = line.decode()
1657
1658 # Ignore any lines we cannot decode
1659 except UnicodeDecodeError:
1660 log.debug("Could not decode line %s in %s" \
1661 % (lineno, geofeed.url))
1662 continue
1663
1664 # Strip any newline
1665 line = line.rstrip()
1666
1667 # Skip empty lines
1668 if not line:
1669 continue
1670
1671 # Try to parse the line
1672 try:
1673 fields = line.split(",", 5)
1674 except ValueError:
1675 log.debug("Could not parse line: %s" % line)
1676 continue
1677
1678 # Check if we have enough fields
1679 if len(fields) < 4:
1680 log.debug("Not enough fields in line: %s" % line)
1681 continue
1682
1683 # Fetch all fields
1684 network, country, region, city = fields[:4]
1685
1686 # Try to parse the network
1687 try:
1688 network = ipaddress.ip_network(network, strict=False)
1689 except ValueError:
1690 log.debug("Could not parse network: %s" % network)
1691 continue
1692
1693 # Strip any excess whitespace from country codes
1694 country = country.strip()
1695
1696 # Make the country code uppercase
1697 country = country.upper()
1698
1699 # Check the country code
1700 if not country:
1701 log.debug("Empty country code in Geofeed %s line %s" \
1702 % (geofeed.url, lineno))
1703 continue
1704
1705 elif not location.country_code_is_valid(country):
1706 log.debug("Invalid country code in Geofeed %s:%s: %s" \
1707 % (geofeed.url, lineno, country))
1708 continue
1709
1710 # Write this into the database
1711 self.db.execute("""
1712 INSERT INTO
1713 geofeed_networks (
1714 geofeed_id,
1715 network,
1716 country,
1717 region,
1718 city
1719 )
1720 VALUES (%s, %s, %s, %s, %s)""",
1721 geofeed.id,
1722 "%s" % network,
1723 country,
1724 region,
1725 city,
1726 )
1727
1728 # Catch any HTTP errors
1729 except urllib.error.HTTPError as e:
1730 self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
1731 WHERE id = %s", e.code, "%s" % e, geofeed.id)
1732
1733 # Remove any previous data when the feed has been deleted
1734 if e.code == 404:
1735 self.db.execute("DELETE FROM geofeed_networks \
1736 WHERE geofeed_id = %s", geofeed.id)
1737
1738 # Catch any other errors and connection timeouts
1739 except (http.client.InvalidURL, urllib.error.URLError, TimeoutError) as e:
1740 log.debug("Could not fetch URL %s: %s" % (geofeed.url, e))
1741
1742 self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
1743 WHERE id = %s", 599, "%s" % e, geofeed.id)
1744
1745 # Mark the geofeed as updated
1746 else:
1747 self.db.execute("""
1748 UPDATE
1749 geofeeds
1750 SET
1751 updated_at = CURRENT_TIMESTAMP,
1752 status = NULL,
1753 error = NULL
1754 WHERE
1755 id = %s""",
1756 geofeed.id,
1757 )
1758
1759 def handle_update_overrides(self, ns):
1760 with self.db.transaction():
1761 # Drop any previous content
1762 self.db.execute("TRUNCATE TABLE autnum_overrides")
1763 self.db.execute("TRUNCATE TABLE network_overrides")
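# Override files consist of RPSL-style blocks as returned by
# location.importer.read_blocks(); the keys handled below look roughly like
# this (illustrative example):
#
#   net:                  192.0.2.0/24
#   country:              DE
#   is-anycast:           yes
#
#   aut-num:              AS64496
#   name:                 Example Corp
#   drop:                 yes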
1764
1765 for file in ns.files:
1766 log.info("Reading %s..." % file)
1767
1768 with open(file, "rb") as f:
1769 for type, block in location.importer.read_blocks(f):
1770 if type == "net":
1771 network = block.get("net")
1772 # Try to parse and normalise the network
1773 try:
1774 network = ipaddress.ip_network(network, strict=False)
1775 except ValueError as e:
1776 log.warning("Invalid IP network: %s: %s" % (network, e))
1777 continue
1778
1779 # Prevent that we overwrite all networks
1780 if network.prefixlen == 0:
1781 log.warning("Skipping %s: You cannot overwrite default" % network)
1782 continue
1783
1784 self.db.execute("""
1785 INSERT INTO
1786 network_overrides
1787 (
1788 network,
1789 country,
1790 is_anonymous_proxy,
1791 is_satellite_provider,
1792 is_anycast,
1793 is_drop
1794 )
1795 VALUES
1796 (
1797 %s, %s, %s, %s, %s, %s
1798 )
1799 ON CONFLICT (network) DO NOTHING
1800 """,
1801 "%s" % network,
1802 block.get("country"),
1803 self._parse_bool(block, "is-anonymous-proxy"),
1804 self._parse_bool(block, "is-satellite-provider"),
1805 self._parse_bool(block, "is-anycast"),
1806 self._parse_bool(block, "drop"),
1807 )
1808
1809 elif type == "aut-num":
1810 autnum = block.get("aut-num")
1811
1812 # Check if AS number begins with "AS"
1813 if not autnum.startswith("AS"):
1814 log.warning("Invalid AS number: %s" % autnum)
1815 continue
1816
1817 # Strip "AS"
1818 autnum = autnum[2:]
1819
1820 self.db.execute("""
1821 INSERT INTO
1822 autnum_overrides
1823 (
1824 number,
1825 name,
1826 country,
1827 is_anonymous_proxy,
1828 is_satellite_provider,
1829 is_anycast,
1830 is_drop
1831 )
1832 VALUES
1833 (
1834 %s, %s, %s, %s, %s, %s, %s
1835 )
1836 ON CONFLICT (number) DO NOTHING
1837 """,
1838 autnum,
1839 block.get("name"),
1840 block.get("country"),
1841 self._parse_bool(block, "is-anonymous-proxy"),
1842 self._parse_bool(block, "is-satellite-provider"),
1843 self._parse_bool(block, "is-anycast"),
1844 self._parse_bool(block, "drop"),
1845 )
1846
1847 else:
1848 log.warning("Unsupported type: %s" % type)
1849
1850 def handle_update_feeds(self, ns):
1851 """
1852 Update any third-party feeds
1853 """
1854 success = True
1855
1856 # Create a downloader
1857 downloader = location.importer.Downloader()
1858
1859 feeds = (
1860 # AWS IP Ranges
1861 ("AWS-IP-RANGES", self._import_aws_ip_ranges, "https://ip-ranges.amazonaws.com/ip-ranges.json"),
1862
1863 # Spamhaus DROP
1864 ("SPAMHAUS-DROP", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/drop.txt"),
1865 ("SPAMHAUS-EDROP", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/edrop.txt"),
1866 ("SPAMHAUS-DROPV6", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/dropv6.txt"),
1867
1868 # Spamhaus ASNDROP
1869 ("SPAMHAUS-ASNDROP", self._import_spamhaus_asndrop, "https://www.spamhaus.org/drop/asndrop.json"),
1870 )
1871
1872 # Drop any data from feeds that we don't support (any more)
1873 with self.db.transaction():
1874 # Fetch the names of all feeds we support
1875 sources = [name for name, *rest in feeds]
1876
1877 self.db.execute("DELETE FROM autnum_feeds WHERE NOT source = ANY(%s)", sources)
1878 self.db.execute("DELETE FROM network_feeds WHERE NOT source = ANY(%s)", sources)
1879
1880 # Walk through all feeds
1881 for name, callback, url, *args in feeds:
1882 # Skip any feeds that were not requested on the command line
1883 if ns.feeds and not name in ns.feeds:
1884 continue
1885
1886 try:
1887 self._process_feed(downloader, name, callback, url, *args)
1888
1889 # Log an error but continue if an exception occurs
1890 except Exception as e:
1891 log.error("Error processing feed '%s': %s" % (name, e))
1892 success = False
1893
1894 # Return status
1895 return 0 if success else 1
1896
1897 def _process_feed(self, downloader, name, callback, url, *args):
1898 """
1899 Processes one feed
1900 """
1901 # Open the URL
1902 f = downloader.retrieve(url)
1903
1904 with self.db.transaction():
1905 # Drop any previous content
1906 self.db.execute("DELETE FROM autnum_feeds WHERE source = %s", name)
1907 self.db.execute("DELETE FROM network_feeds WHERE source = %s", name)
1908
1909 # Call the callback to process the feed
1910 return callback(name, f, *args)
1911
1912 def _import_aws_ip_ranges(self, name, f):
1913 # Parse the feed
1914 feed = json.load(f)
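# The feed is a JSON document roughly of this shape (abridged; only the keys
# used below are shown):
#
#   {
#     "prefixes"      : [{"ip_prefix": "192.0.2.0/24", "region": "eu-central-1"}],
#     "ipv6_prefixes" : [{"ipv6_prefix": "2001:db8::/32", "region": "GLOBAL"}]
#   }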
1915
1916 # Set up a dictionary for mapping a region name to a country. Unfortunately,
1917 # there seems to be no machine-readable version available of this other than
1918 # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html
1919 # (worse, it seems to be incomplete :-/ ); https://www.cloudping.cloud/endpoints
1920 # was helpful here as well.
1921 aws_region_country_map = {
1922 # Africa
1923 "af-south-1" : "ZA",
1924
1925 # Asia
1926 "il-central-1" : "IL", # Tel Aviv
1927
1928 # Asia/Pacific
1929 "ap-northeast-1" : "JP",
1930 "ap-northeast-2" : "KR",
1931 "ap-northeast-3" : "JP",
1932 "ap-east-1" : "HK",
1933 "ap-south-1" : "IN",
1934 "ap-south-2" : "IN",
1935 "ap-southeast-1" : "SG",
1936 "ap-southeast-2" : "AU",
1937 "ap-southeast-3" : "MY",
1938 "ap-southeast-4" : "AU",
1939 "ap-southeast-5" : "NZ", # Auckland, NZ
1940 "ap-southeast-6" : "AP", # XXX: Precise location not documented anywhere
1941
1942 # Canada
1943 "ca-central-1" : "CA",
1944 "ca-west-1" : "CA",
1945
1946 # Europe
1947 "eu-central-1" : "DE",
1948 "eu-central-2" : "CH",
1949 "eu-north-1" : "SE",
1950 "eu-west-1" : "IE",
1951 "eu-west-2" : "GB",
1952 "eu-west-3" : "FR",
1953 "eu-south-1" : "IT",
1954 "eu-south-2" : "ES",
1955
1956 # Middle East
1957 "me-central-1" : "AE",
1958 "me-south-1" : "BH",
1959
1960 # South America
1961 "sa-east-1" : "BR",
1962
1963 # Undocumented, likely located in Berlin rather than Frankfurt
1964 "eusc-de-east-1" : "DE",
1965 }
1966
1967 # Collect a list of all networks
1968 prefixes = feed.get("ipv6_prefixes", []) + feed.get("prefixes", [])
1969
1970 for prefix in prefixes:
1971 # Fetch network
1972 network = prefix.get("ipv6_prefix") or prefix.get("ip_prefix")
1973
1974 # Parse the network
1975 try:
1976 network = ipaddress.ip_network(network)
1977 except ValueError as e:
1978 log.warning("%s: Unable to parse prefix %s" % (name, network))
1979 continue
1980
1981 # Sanitize parsed networks...
1982 if not self._check_parsed_network(network):
1983 continue
1984
1985 # Fetch the region
1986 region = prefix.get("region")
1987
1988 # Set some defaults
1989 cc = None
1990 is_anycast = False
1991
1992 # Fetch the CC from the dictionary
1993 try:
1994 cc = aws_region_country_map[region]
1995
1996 # If we couldn't find anything, let's try something else...
1997 except KeyError as e:
1998 # Find anycast networks
1999 if region == "GLOBAL":
2000 is_anycast = True
2001
2002 # Everything that starts with us- is probably in the United States
2003 elif region.startswith("us-"):
2004 cc = "US"
2005
2006 # Everything that starts with cn- is probably China
2007 elif region.startswith("cn-"):
2008 cc = "CN"
2009
2010 # Log a warning for anything else
2011 else:
2012 log.warning("%s: Could not determine country code for AWS region %s" \
2013 % (name, region))
2014 continue
2015
2016 # Write to database
2017 self.db.execute("""
2018 INSERT INTO
2019 network_feeds
2020 (
2021 network,
2022 source,
2023 country,
2024 is_anycast
2025 )
2026 VALUES
2027 (
2028 %s, %s, %s, %s
2029 )
2030 ON CONFLICT (network, source) DO NOTHING
2031 """, "%s" % network, name, cc, is_anycast,
2032 )
2033
2034 def _import_spamhaus_drop(self, name, f):
2035 """
2036 Import Spamhaus DROP IP feeds
2037 """
2038 # Count all lines
2039 lines = 0
2040
2041 # Walk through all lines
2042 for line in f:
2043 # Decode line
2044 line = line.decode("utf-8")
2045
2046 # Strip off any comments
2047 line, _, comment = line.partition(";")
2048
2049 # Ignore empty lines
2050 if not line:
2051 continue
2052
2053 # Strip any excess whitespace
2054 line = line.strip()
2055
2056 # Increment line counter
2057 lines += 1
2058
2059 # Parse the network
2060 try:
2061 network = ipaddress.ip_network(line)
2062 except ValueError as e:
2063 log.warning("%s: Could not parse network: %s - %s" % (name, line, e))
2064 continue
2065
2066 # Check network
2067 if not self._check_parsed_network(network):
2068 log.warning("%s: Skipping bogus network: %s" % (name, network))
2069 continue
2070
2071 # Insert into the database
2072 self.db.execute("""
2073 INSERT INTO
2074 network_feeds
2075 (
2076 network,
2077 source,
2078 is_drop
2079 )
2080 VALUES
2081 (
2082 %s, %s, %s
2083 )""", "%s" % network, name, True,
2084 )
2085
2086 # Raise an exception if we could not import anything
2087 if not lines:
2088 raise RuntimeError("Received bogus feed %s with no data" % name)
2089
2090 def _import_spamhaus_asndrop(self, name, f):
2091 """
2092 Import Spamhaus ASNDROP feed
2093 """
2094 for line in f:
2095 # Decode the line
2096 line = line.decode("utf-8")
2097
2098 # Parse JSON
2099 try:
2100 line = json.loads(line)
2101 except json.JSONDecodeError as e:
2102 log.warning("%s: Unable to parse JSON object %s: %s" % (name, line, e))
2103 continue
2104
2105 # Fetch type
2106 type = line.get("type")
2107
2108 # Skip any metadata
2109 if type == "metadata":
2110 continue
2111
2112 # Fetch ASN
2113 asn = line.get("asn")
2114
2115 # Skip any lines without an ASN
2116 if not asn:
2117 continue
2118
2119 # Filter invalid ASNs
2120 if not self._check_parsed_asn(asn):
2121 log.warning("%s: Skipping bogus ASN %s" % (name, asn))
2122 continue
2123
2124 # Write to database
2125 self.db.execute("""
2126 INSERT INTO
2127 autnum_feeds
2128 (
2129 number,
2130 source,
2131 is_drop
2132 )
2133 VALUES
2134 (
2135 %s, %s, %s
2136 )""", "%s" % asn, name, True,
2137 )
2138
2139 @staticmethod
2140 def _parse_bool(block, key):
2141 val = block.get(key)
2142
2143 # There is no point in proceeding when we got None
2144 if val is None:
2145 return
2146
2147 # Convert to lowercase
2148 val = val.lower()
2149
2150 # True
2151 if val in ("yes", "1"):
2152 return True
2153
2154 # False
2155 if val in ("no", "0"):
2156 return False
2157
2158 # Default to None
2159 return None
2160
2161 def handle_import_countries(self, ns):
2162 with self.db.transaction():
2163 # Drop all data that we have
2164 self.db.execute("TRUNCATE TABLE countries")
2165
2166 for file in ns.file:
2167 for line in file:
2168 line = line.rstrip()
2169
2170 # Ignore any comments
2171 if line.startswith("#"):
2172 continue
2173
2174 try:
2175 country_code, continent_code, name = line.split(maxsplit=2)
2176 except ValueError:
2177 log.warning("Could not parse line: %s" % line)
2178 continue
2179
2180 self.db.execute("INSERT INTO countries(country_code, name, continent_code) \
2181 VALUES(%s, %s, %s) ON CONFLICT DO NOTHING", country_code, name, continent_code)
2182
2183
2184 def split_line(line):
2185 key, colon, val = line.partition(":")
2186
2187 # Strip any excess space
2188 key = key.strip()
2189 val = val.strip()
2190
2191 return key, val
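# For example, split_line("inetnum:        192.0.2.0 - 192.0.2.255") returns
# ("inetnum", "192.0.2.0 - 192.0.2.255").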
2192
2193 def main():
2194 # Run the command line interface
2195 c = CLI()
2196 c.run()
2197
2198 main()