#!/usr/bin/python3
###############################################################################
#                                                                             #
# libloc - A library to determine the location of someone on the Internet    #
#                                                                             #
# Copyright (C) 2020-2024 IPFire Development Team <info@ipfire.org>           #
#                                                                             #
# This library is free software; you can redistribute it and/or              #
# modify it under the terms of the GNU Lesser General Public                  #
# License as published by the Free Software Foundation; either               #
# version 2.1 of the License, or (at your option) any later version.         #
#                                                                             #
# This library is distributed in the hope that it will be useful,            #
# but WITHOUT ANY WARRANTY; without even the implied warranty of             #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU           #
# Lesser General Public License for more details.                            #
#                                                                             #
###############################################################################

import argparse
import concurrent.futures
import csv
import functools
import http.client
import ipaddress
import json
import logging
import math
import re
import socket
import sys
import urllib.error
import urllib.parse
import urllib.request

# Load our location module
import location
import location.database
import location.importer
from location.i18n import _

# Initialise logging
log = logging.getLogger("location.importer")
log.propagate = True

# Define constants
VALID_ASN_RANGES = (
	(1, 23455),
	(23457, 64495),
	(131072, 4199999999),
)
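# Note: these ranges are meant to cover the ASNs that IANA designates for
# public use: the 16-bit range minus AS_TRANS (23456) and the reserved blocks
# at the top, plus the public 32-bit range. Anything outside of them
# (private-use ASNs such as 64512-65534 or 4200000000 and above, and reserved
# numbers) is filtered out by _check_parsed_asn() below.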

TRANSLATED_COUNTRIES = {
	# When people say UK, they mean GB
	"UK" : "GB",
}

IGNORED_COUNTRIES = set((
	# Formerly Yugoslavia
	"YU",

	# Some people use ZZ to say "no country" or to hide the country
	"ZZ",
))

# Configure the CSV parser for ARIN
csv.register_dialect("arin", delimiter=",", quoting=csv.QUOTE_ALL, quotechar="\"")

class CLI(object):
	def parse_cli(self):
		parser = argparse.ArgumentParser(
			description=_("Location Importer Command Line Interface"),
		)
		subparsers = parser.add_subparsers()

		# Global configuration flags
		parser.add_argument("--debug", action="store_true",
			help=_("Enable debug output"))
		parser.add_argument("--quiet", action="store_true",
			help=_("Enable quiet mode"))

		# version
		parser.add_argument("--version", action="version",
			version="%(prog)s @VERSION@")

		# Database
		parser.add_argument("--database-host", required=True,
			help=_("Database Hostname"), metavar=_("HOST"))
		parser.add_argument("--database-name", required=True,
			help=_("Database Name"), metavar=_("NAME"))
		parser.add_argument("--database-username", required=True,
			help=_("Database Username"), metavar=_("USERNAME"))
		parser.add_argument("--database-password", required=True,
			help=_("Database Password"), metavar=_("PASSWORD"))

		# Write Database
		write = subparsers.add_parser("write", help=_("Write database to file"))
		write.set_defaults(func=self.handle_write)
		write.add_argument("file", nargs=1, help=_("Database File"))
		write.add_argument("--signing-key", nargs="?", type=open, help=_("Signing Key"))
		write.add_argument("--backup-signing-key", nargs="?", type=open, help=_("Backup Signing Key"))
		write.add_argument("--vendor", nargs="?", help=_("Sets the vendor"))
		write.add_argument("--description", nargs="?", help=_("Sets a description"))
		write.add_argument("--license", nargs="?", help=_("Sets the license"))
		write.add_argument("--version", type=int, help=_("Database Format Version"))

		# Update WHOIS
		update_whois = subparsers.add_parser("update-whois", help=_("Update WHOIS Information"))
		update_whois.set_defaults(func=self.handle_update_whois)

		# Update announcements
		update_announcements = subparsers.add_parser("update-announcements",
			help=_("Update BGP Announcements"))
		update_announcements.set_defaults(func=self.handle_update_announcements)
		update_announcements.add_argument("server", nargs=1,
			help=_("Route Server to connect to"), metavar=_("SERVER"))

		# Update geofeeds
		update_geofeeds = subparsers.add_parser("update-geofeeds",
			help=_("Update Geofeeds"))
		update_geofeeds.set_defaults(func=self.handle_update_geofeeds)

		# Update feeds
		update_feeds = subparsers.add_parser("update-feeds",
			help=_("Update Feeds"))
		update_feeds.add_argument("feeds", nargs="*",
			help=_("Only update these feeds"))
		update_feeds.set_defaults(func=self.handle_update_feeds)

		# Update overrides
		update_overrides = subparsers.add_parser("update-overrides",
			help=_("Update overrides"),
		)
		update_overrides.add_argument(
			"files", nargs="+", help=_("Files to import"),
		)
		update_overrides.set_defaults(func=self.handle_update_overrides)

		# Import countries
		import_countries = subparsers.add_parser("import-countries",
			help=_("Import countries"),
		)
		import_countries.add_argument("file", nargs=1, type=argparse.FileType("r"),
			help=_("File to import"))
		import_countries.set_defaults(func=self.handle_import_countries)

		args = parser.parse_args()

		# Configure logging
		if args.debug:
			location.logger.set_level(logging.DEBUG)
		elif args.quiet:
			location.logger.set_level(logging.WARNING)

		# Print usage if no action was given
		if not "func" in args:
			parser.print_usage()
			sys.exit(2)

		return args

	def run(self):
		# Parse command line arguments
		args = self.parse_cli()

		# Initialise database
		self.db = self._setup_database(args)

		# Call function
		ret = args.func(args)

		# Return with exit code
		if ret:
			sys.exit(ret)

		# Otherwise just exit
		sys.exit(0)

	def _setup_database(self, ns):
		"""
			Initialise the database
		"""
		# Connect to database
		db = location.database.Connection(
			host=ns.database_host, database=ns.database_name,
			user=ns.database_username, password=ns.database_password,
		)

		with db.transaction():
			db.execute("""
				-- announcements
				CREATE TABLE IF NOT EXISTS announcements(network inet, autnum bigint,
					first_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
					last_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP);
				CREATE UNIQUE INDEX IF NOT EXISTS announcements_networks ON announcements(network);
				CREATE INDEX IF NOT EXISTS announcements_family ON announcements(family(network));
				CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);

				-- autnums
				CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
				ALTER TABLE autnums ADD COLUMN IF NOT EXISTS source text;
				CREATE UNIQUE INDEX IF NOT EXISTS autnums_number ON autnums(number);

				-- countries
				CREATE TABLE IF NOT EXISTS countries(
					country_code text NOT NULL, name text NOT NULL, continent_code text NOT NULL);
				CREATE UNIQUE INDEX IF NOT EXISTS countries_country_code ON countries(country_code);

				-- networks
				CREATE TABLE IF NOT EXISTS networks(network inet, country text);
				ALTER TABLE networks ADD COLUMN IF NOT EXISTS original_countries text[];
				ALTER TABLE networks ADD COLUMN IF NOT EXISTS source text;
				CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
				CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
				CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);

				-- geofeeds
				CREATE TABLE IF NOT EXISTS geofeeds(
					id serial primary key,
					url text,
					status integer default null,
					updated_at timestamp without time zone default null
				);
				ALTER TABLE geofeeds ADD COLUMN IF NOT EXISTS error text;
				CREATE UNIQUE INDEX IF NOT EXISTS geofeeds_unique
					ON geofeeds(url);
				CREATE TABLE IF NOT EXISTS geofeed_networks(
					geofeed_id integer references geofeeds(id) on delete cascade,
					network inet,
					country text,
					region text,
					city text
				);
				CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
					ON geofeed_networks(geofeed_id);
				CREATE INDEX IF NOT EXISTS geofeed_networks_search
					ON geofeed_networks USING GIST(network inet_ops);
				CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
				CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
					ON network_geofeeds(network);
				CREATE INDEX IF NOT EXISTS network_geofeeds_search
					ON network_geofeeds USING GIST(network inet_ops);
				CREATE INDEX IF NOT EXISTS network_geofeeds_url
					ON network_geofeeds(url);

				-- feeds
				CREATE TABLE IF NOT EXISTS autnum_feeds(
					number bigint NOT NULL,
					source text NOT NULL,
					name text,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean,
					is_drop boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS autnum_feeds_unique
					ON autnum_feeds(number, source);

				CREATE TABLE IF NOT EXISTS network_feeds(
					network inet NOT NULL,
					source text NOT NULL,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean,
					is_drop boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS network_feeds_unique
					ON network_feeds(network, source);
				CREATE INDEX IF NOT EXISTS network_feeds_search
					ON network_feeds USING GIST(network inet_ops);

				-- overrides
				CREATE TABLE IF NOT EXISTS autnum_overrides(
					number bigint NOT NULL,
					name text,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS autnum_overrides_number
					ON autnum_overrides(number);
				ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
				ALTER TABLE autnum_overrides DROP COLUMN IF EXISTS source;

				CREATE TABLE IF NOT EXISTS network_overrides(
					network inet NOT NULL,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS network_overrides_network
					ON network_overrides(network);
				CREATE INDEX IF NOT EXISTS network_overrides_search
					ON network_overrides USING GIST(network inet_ops);
				ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
				ALTER TABLE network_overrides DROP COLUMN IF EXISTS source;
			""")

		return db

	def fetch_countries(self):
		"""
			Returns a set of all country codes that are stored in the database
		"""
		# Fetch all valid country codes to check parsed networks against...
		countries = self.db.query("SELECT country_code FROM countries ORDER BY country_code")

		return set((country.country_code for country in countries))

	def handle_write(self, ns):
		"""
			Compiles a database in libloc format from what is stored in the SQL database
		"""
		# Allocate a writer
		writer = location.Writer(ns.signing_key, ns.backup_signing_key)

		# Set all metadata
		if ns.vendor:
			writer.vendor = ns.vendor

		if ns.description:
			writer.description = ns.description

		if ns.license:
			writer.license = ns.license

		# Add all Autonomous Systems
		log.info("Writing Autonomous Systems...")

		# Select all ASes with a name
		rows = self.db.query("""
			SELECT
				autnums.number AS number,
				COALESCE(
					overrides.name,
					autnums.name
				) AS name
			FROM
				autnums
			LEFT JOIN
				autnum_overrides overrides ON autnums.number = overrides.number
			ORDER BY
				autnums.number
			""")

		for row in rows:
			# Skip AS without names
			if not row.name:
				continue

			a = writer.add_as(row.number)
			a.name = row.name

		# Add all networks
		log.info("Writing networks...")

		# Select all known networks
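		# The query below uses COALESCE chains to pick the best value for each
		# attribute: for the country, network overrides win over AS overrides,
		# followed by network feeds, AS feeds, geofeed data and finally the
		# country from the RIR data itself; the flags are resolved similarly
		# and default to FALSE when no source has an opinion.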
		rows = self.db.query("""
			WITH known_networks AS (
				SELECT network FROM announcements
				UNION
				SELECT network FROM networks
				UNION
				SELECT network FROM network_feeds
				UNION
				SELECT network FROM network_overrides
				UNION
				SELECT network FROM geofeed_networks
			),

			ordered_networks AS (
				SELECT
					known_networks.network AS network,
					announcements.autnum AS autnum,
					networks.country AS country,

					-- Must be part of returned values for ORDER BY clause
					masklen(announcements.network) AS sort_a,
					masklen(networks.network) AS sort_b
				FROM
					known_networks
				LEFT JOIN
					announcements ON known_networks.network <<= announcements.network
				LEFT JOIN
					networks ON known_networks.network <<= networks.network
				ORDER BY
					known_networks.network,
					sort_a DESC,
					sort_b DESC
			)

			-- Return a list of those networks enriched with all
			-- other information that we store in the database
			SELECT
				DISTINCT ON (network)
				network,
				autnum,

				-- Country
				COALESCE(
					(
						SELECT
							country
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							country
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					(
						SELECT
							country
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							country
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							geofeed_networks.country AS country
						FROM
							network_geofeeds

						-- Join the data from the geofeeds
						LEFT JOIN
							geofeeds ON network_geofeeds.url = geofeeds.url
						LEFT JOIN
							geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id

						-- Check whether we have a geofeed for this network
						WHERE
							networks.network <<= network_geofeeds.network
						AND
							networks.network <<= geofeed_networks.network

						-- Filter for the best result
						ORDER BY
							masklen(geofeed_networks.network) DESC
						LIMIT 1
					),
					networks.country
				) AS country,

				-- Flags
				COALESCE(
					(
						SELECT
							is_anonymous_proxy
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anonymous_proxy
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anonymous_proxy
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_anonymous_proxy
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_anonymous_proxy,
				COALESCE(
					(
						SELECT
							is_satellite_provider
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_satellite_provider
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_satellite_provider
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_satellite_provider
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_satellite_provider,
				COALESCE(
					(
						SELECT
							is_anycast
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anycast
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anycast
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_anycast
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_anycast,
				COALESCE(
					(
						SELECT
							is_drop
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_drop
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_drop
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_drop
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_drop
			FROM
				ordered_networks networks
			""")

		for row in rows:
			network = writer.add_network(row.network)

			# Save country
			if row.country:
				network.country_code = row.country

			# Save ASN
			if row.autnum:
				network.asn = row.autnum

			# Set flags
			if row.is_anonymous_proxy:
				network.set_flag(location.NETWORK_FLAG_ANONYMOUS_PROXY)

			if row.is_satellite_provider:
				network.set_flag(location.NETWORK_FLAG_SATELLITE_PROVIDER)

			if row.is_anycast:
				network.set_flag(location.NETWORK_FLAG_ANYCAST)

			if row.is_drop:
				network.set_flag(location.NETWORK_FLAG_DROP)

		# Add all countries
		log.info("Writing countries...")
		rows = self.db.query("SELECT * FROM countries ORDER BY country_code")

		for row in rows:
			c = writer.add_country(row.country_code)
			c.continent_code = row.continent_code
			c.name = row.name

		# Write everything to file
		log.info("Writing database to file...")
		for file in ns.file:
			writer.write(file)

	def handle_update_whois(self, ns):
		downloader = location.importer.Downloader()

		# Did we run successfully?
		error = False

		# Fetch all valid country codes to check parsed networks against
		countries = self.fetch_countries()

		# Check if we have countries
		if not countries:
			log.error("Please import countries before importing any WHOIS data")
			return 1

		# Iterate over all potential sources
		for source in sorted(location.importer.SOURCES):
			with self.db.transaction():
				# Create some temporary tables to store parsed data
				self.db.execute("""
					CREATE TEMPORARY TABLE _autnums(number integer NOT NULL,
						organization text NOT NULL, source text NOT NULL) ON COMMIT DROP;
					CREATE UNIQUE INDEX _autnums_number ON _autnums(number);

					CREATE TEMPORARY TABLE _organizations(handle text NOT NULL,
						name text NOT NULL, source text NOT NULL) ON COMMIT DROP;
					CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);

					CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text,
						original_countries text[] NOT NULL, source text NOT NULL)
						ON COMMIT DROP;
					CREATE INDEX _rirdata_search ON _rirdata
						USING BTREE(family(network), masklen(network));
					CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
				""")

				# Remove all previously imported content
				self.db.execute("DELETE FROM autnums WHERE source = %s", source)
				self.db.execute("DELETE FROM networks WHERE source = %s", source)

				try:
					# Fetch WHOIS sources
					for url in location.importer.WHOIS_SOURCES.get(source, []):
						for block in downloader.request_blocks(url):
							self._parse_block(block, source, countries)

					# Fetch extended sources
					for url in location.importer.EXTENDED_SOURCES.get(source, []):
						for line in downloader.request_lines(url):
							self._parse_line(line, source, countries)
				except urllib.error.URLError as e:
					log.error("Could not retrieve data from %s: %s" % (source, e))
					error = True

					# Continue with the next source
					continue

				# Process all parsed networks from every RIR we happen to have access to,
				# insert the largest network chunks into the networks table immediately...
				families = self.db.query("""
					SELECT DISTINCT
						family(network) AS family
					FROM
						_rirdata
					ORDER BY
						family(network)
					""",
				)

				for family in (row.family for row in families):
					# Fetch the smallest mask length in our data set
					smallest = self.db.get("""
						SELECT
							MIN(
								masklen(network)
							) AS prefix
						FROM
							_rirdata
						WHERE
							family(network) = %s
						""", family,
					)

					# Copy all networks
					self.db.execute("""
						INSERT INTO
							networks
						(
							network,
							country,
							original_countries,
							source
						)
						SELECT
							network,
							country,
							original_countries,
							source
						FROM
							_rirdata
						WHERE
							masklen(network) = %s
						AND
							family(network) = %s
						ON CONFLICT DO NOTHING""",
						smallest.prefix,
						family,
					)

					# ... determine any other prefixes for this network family, ...
					prefixes = self.db.query("""
						SELECT
							DISTINCT masklen(network) AS prefix
						FROM
							_rirdata
						WHERE
							family(network) = %s
						ORDER BY
							masklen(network) ASC
						OFFSET 1
						""", family,
					)

					# ... and insert networks with this prefix in case they provide additional
					# information (i. e. subnet of a larger chunk with a different country)
					for prefix in (row.prefix for row in prefixes):
						self.db.execute("""
							WITH candidates AS (
								SELECT
									_rirdata.network,
									_rirdata.country,
									_rirdata.original_countries,
									_rirdata.source
								FROM
									_rirdata
								WHERE
									family(_rirdata.network) = %s
								AND
									masklen(_rirdata.network) = %s
							),
							filtered AS (
								SELECT
									DISTINCT ON (c.network)
									c.network,
									c.country,
									c.original_countries,
									c.source,
									masklen(networks.network),
									networks.country AS parent_country
								FROM
									candidates c
								LEFT JOIN
									networks
								ON
									c.network << networks.network
								ORDER BY
									c.network,
									masklen(networks.network) DESC NULLS LAST
							)
							INSERT INTO
								networks(network, country, original_countries, source)
							SELECT
								network,
								country,
								original_countries,
								source
							FROM
								filtered
							WHERE
								parent_country IS NULL
							OR
								country <> parent_country
							ON CONFLICT DO NOTHING
							""", family, prefix,
						)

				self.db.execute("""
					INSERT INTO
						autnums
					(
						number,
						name,
						source
					)
					SELECT
						_autnums.number,
						_organizations.name,
						_organizations.source
					FROM
						_autnums
					JOIN
						_organizations ON _autnums.organization = _organizations.handle
					ON CONFLICT
					(
						number
					)
					DO UPDATE
						SET name = excluded.name
					""",
				)

		# Download and import (technical) AS names from ARIN
		with self.db.transaction():
			self._import_as_names_from_arin(downloader)

		# Return a non-zero exit code for errors
		return 1 if error else 0

	def _check_parsed_network(self, network):
		"""
			Assistive function to detect and subsequently sort out parsed
			networks from RIR data (both WHOIS and so-called "extended
			sources"), which...

			(a) are not globally routable (RFC 1918 space, et al.)
			(b) cover too large a chunk of the IP address space (prefix length
			    is < 7 for IPv4 networks, and < 10 for IPv6)
			(c) have "0.0.0.0" or "::" as their network address
			(d) are too small to be publicly announced (we have decided not to
			    process them at the moment, as they significantly enlarge our
			    database without providing very helpful additional information)

			This unfortunately is necessary due to brain-dead clutter across
			various RIR databases, causing mismatches and eventually disruptions.

			We will return False in case a network is not suitable for adding
			it to our database, and True otherwise.
		"""

		if not network or not (isinstance(network, ipaddress.IPv4Network) or isinstance(network, ipaddress.IPv6Network)):
			return False

		if not network.is_global:
			log.debug("Skipping non-globally routable network: %s" % network)
			return False

		if network.version == 4:
			if network.prefixlen < 7:
				log.debug("Skipping too big IP chunk: %s" % network)
				return False

			if network.prefixlen > 24:
				log.debug("Skipping network too small to be publicly announced: %s" % network)
				return False

			if str(network.network_address) == "0.0.0.0":
				log.debug("Skipping network based on 0.0.0.0: %s" % network)
				return False

		elif network.version == 6:
			if network.prefixlen < 10:
				log.debug("Skipping too big IP chunk: %s" % network)
				return False

			if network.prefixlen > 48:
				log.debug("Skipping network too small to be publicly announced: %s" % network)
				return False

			if str(network.network_address) == "::":
				log.debug("Skipping network based on '::': %s" % network)
				return False

		else:
			# This should not happen...
			log.warning("Skipping network of unknown family, this should not happen: %s" % network)
			return False

		# In case we have made it here, the network is considered to
		# be suitable for libloc consumption...
		return True

	def _check_parsed_asn(self, asn):
		"""
			Assistive function to filter Autonomous System Numbers not being suitable
			for adding to our database. Returns False in such cases, and True otherwise.
		"""

		for start, end in VALID_ASN_RANGES:
			if start <= asn <= end:
				return True

		log.info("Supplied ASN %s out of publicly routable ASN ranges" % asn)
		return False

	def _parse_block(self, block, source_key, countries):
		# Get first line to find out what type of block this is
		line = block[0]

		# aut-num
		if line.startswith("aut-num:"):
			return self._parse_autnum_block(block, source_key)

		# inetnum
		if line.startswith("inet6num:") or line.startswith("inetnum:"):
			return self._parse_inetnum_block(block, source_key, countries)

		# organisation
		elif line.startswith("organisation:"):
			return self._parse_org_block(block, source_key)
	def _parse_autnum_block(self, block, source_key):
		autnum = {}
		for line in block:
			# Split line
			key, val = split_line(line)

			if key == "aut-num":
				m = re.match(r"^(AS|as)(\d+)", val)
				if m:
					autnum["asn"] = m.group(2)

			elif key == "org":
				autnum[key] = val.upper()

			elif key == "descr":
				# Save the first description line as well...
				if not key in autnum:
					autnum[key] = val

		# Skip empty objects
		if not autnum or not "asn" in autnum:
			return

		# Insert a dummy organisation handle into our temporary organisations
		# table in case the AS does not have an organisation handle set, but
		# has a description (a quirk often observed in APNIC area), so we can
		# later display at least some string for this AS.
		if not "org" in autnum:
			if "descr" in autnum:
				autnum["org"] = "LIBLOC-%s-ORGHANDLE" % autnum.get("asn")

				self.db.execute("INSERT INTO _organizations(handle, name, source) \
					VALUES(%s, %s, %s) ON CONFLICT (handle) DO NOTHING",
					autnum.get("org"), autnum.get("descr"), source_key,
				)
			else:
				log.warning("ASN %s neither has an organisation handle nor a description line set, omitting" % \
					autnum.get("asn"))
				return

		# Insert into database
		self.db.execute("INSERT INTO _autnums(number, organization, source) \
			VALUES(%s, %s, %s) ON CONFLICT (number) DO UPDATE SET \
			organization = excluded.organization",
			autnum.get("asn"), autnum.get("org"), source_key,
		)

	def _parse_inetnum_block(self, block, source_key, countries):
		log.debug("Parsing inetnum block:")

		inetnum = {}
		for line in block:
			log.debug(line)

			# Split line
			key, val = split_line(line)

			# Filter any inetnum records which are only referring to IP space
			# not managed by that specific RIR...
			if key == "netname":
				if re.match(r"^(ERX-NETBLOCK|(AFRINIC|ARIN|LACNIC|RIPE)-CIDR-BLOCK|IANA-NETBLOCK-\d{1,3}|NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK|STUB-[\d-]{3,}SLASH\d{1,2})", val.strip()):
					log.debug("Skipping record indicating historic/orphaned data: %s" % val.strip())
					return

			if key == "inetnum":
				start_address, delim, end_address = val.partition("-")

				# Strip any excess space
				start_address, end_address = start_address.rstrip(), end_address.strip()

				# Handle "inetnum" formatting in LACNIC DB (e.g. "24.152.8/22" instead of "24.152.8.0/22")
				if start_address and not (delim or end_address):
					try:
						start_address = ipaddress.ip_network(start_address, strict=False)
					except ValueError:
						start_address = start_address.split("/")
						ldigits = start_address[0].count(".")

						# How many octets do we need to add?
						# (LACNIC does not seem to have a /8 or greater assigned, so the following should suffice.)
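						# Worked example: "24.152.8/22" fails to parse above, is
						# split into ["24.152.8", "22"], counts two dots and
						# therefore becomes "24.152.8.0/22" below.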
						if ldigits == 1:
							start_address = start_address[0] + ".0.0/" + start_address[1]
						elif ldigits == 2:
							start_address = start_address[0] + ".0/" + start_address[1]
						else:
							log.warning("Could not recover IPv4 address from line in LACNIC DB format: %s" % line)
							return

						try:
							start_address = ipaddress.ip_network(start_address, strict=False)
						except ValueError:
							log.warning("Could not parse line in LACNIC DB format: %s" % line)
							return

					# Enumerate first and last IP address of this network
					end_address = start_address[-1]
					start_address = start_address[0]

				else:
					# Convert to IP address
					try:
						start_address = ipaddress.ip_address(start_address)
						end_address = ipaddress.ip_address(end_address)
					except ValueError:
						log.warning("Could not parse line: %s" % line)
						return

				inetnum["inetnum"] = list(ipaddress.summarize_address_range(start_address, end_address))

			elif key == "inet6num":
				inetnum[key] = [ipaddress.ip_network(val, strict=False)]

			elif key == "country":
				cc = val.upper()

				# Ignore certain country codes
				if cc in IGNORED_COUNTRIES:
					log.debug("Ignoring country code '%s'" % cc)
					continue

				# Translate country codes
				try:
					cc = TRANSLATED_COUNTRIES[cc]
				except KeyError:
					pass

				# Do we know this country?
				if not cc in countries:
					log.warning("Skipping invalid country code '%s'" % cc)
					continue

				try:
					inetnum[key].append(cc)
				except KeyError:
					inetnum[key] = [cc]

			# Parse the geofeed attribute
			elif key == "geofeed":
				inetnum["geofeed"] = val

			# Parse geofeed when used as a remark
			elif key == "remarks":
				m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
				if m:
					inetnum["geofeed"] = m.group(1)

		# Skip empty objects
		if not inetnum:
			return

		# Iterate through all networks enumerated from above, check them for plausibility and insert
		# them into the database, if _check_parsed_network() succeeded
		for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
			if not self._check_parsed_network(single_network):
				continue

			# Fetch the countries or use a list with an empty country
			countries = inetnum.get("country", [None])

			# Insert the network into the database but only use the first country code
			for cc in countries:
				self.db.execute("""
					INSERT INTO
						_rirdata
					(
						network,
						country,
						original_countries,
						source
					)
					VALUES
					(
						%s, %s, %s, %s
					)
					ON CONFLICT (network)
						DO UPDATE SET country = excluded.country
					""", "%s" % single_network, cc, [cc for cc in countries if cc], source_key,
				)

				# If there is more than one country, we will only use the first one
				break

			# Update any geofeed information
			geofeed = inetnum.get("geofeed", None)
			if geofeed:
				self._parse_geofeed(geofeed, single_network)

			# Delete any previous geofeeds
			else:
				self.db.execute(
					"DELETE FROM network_geofeeds WHERE network = %s", "%s" % single_network,
				)

	def _parse_geofeed(self, url, single_network):
		# Parse the URL
		url = urllib.parse.urlparse(url)

		# Make sure that this is a HTTPS URL
		if not url.scheme == "https":
			log.debug("Geofeed URL is not using HTTPS: %s" % url.geturl())
			return

		# Put the URL back together normalized
		url = url.geturl()

		# Store/update any geofeeds
		self.db.execute("""
			INSERT INTO
				network_geofeeds(
					network,
					url
				)
			VALUES(
				%s, %s
			)
			ON CONFLICT (network) DO
				UPDATE SET url = excluded.url""",
			"%s" % single_network, url,
		)

	def _parse_org_block(self, block, source_key):
		org = {}
		for line in block:
			# Split line
			key, val = split_line(line)

			if key == "organisation":
				org[key] = val.upper()
			elif key == "org-name":
				org[key] = val

		# Skip empty objects
		if not org:
			return

		self.db.execute("INSERT INTO _organizations(handle, name, source) \
			VALUES(%s, %s, %s) ON CONFLICT (handle) DO \
			UPDATE SET name = excluded.name",
			org.get("organisation"), org.get("org-name"), source_key,
		)

	def _parse_line(self, line, source_key, validcountries=None):
		# Skip version line
		if line.startswith("2"):
			return

		# Skip comments
		if line.startswith("#"):
			return

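		# Lines in the so-called "extended" delegation statistics are
		# pipe-separated, e.g. (illustrative):
		#
		#   ripencc|DE|ipv4|192.0.2.0|256|20100101|allocated|<opaque-id>
		#
		# i.e. registry, country code, protocol, first address, a size value
		# (address count for IPv4, prefix length for IPv6), date, status and
		# an optional opaque identifier.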
		try:
			registry, country_code, type, line = line.split("|", 3)
		except ValueError:
			log.warning("Could not parse line: %s" % line)
			return

		# Skip any unknown protocols
		if not type in ("ipv6", "ipv4"):
			log.warning("Unknown IP protocol '%s'" % type)
			return

		# Skip any lines that are for stats only or do not have a country
		# code at all (avoids log spam below)
		if not country_code or country_code == '*':
			return

		# Skip objects with unknown country codes
		if validcountries and country_code not in validcountries:
			log.warning("Skipping line with bogus country '%s': %s" % \
				(country_code, line))
			return

		try:
			address, prefix, date, status, organization = line.split("|")
		except ValueError:
			organization = None

			# Try parsing the line without organization
			try:
				address, prefix, date, status = line.split("|")
			except ValueError:
				log.warning("Unhandled line format: %s" % line)
				return

		# Skip anything that isn't properly assigned
		if not status in ("assigned", "allocated"):
			return

		# Cast prefix into an integer
		try:
			prefix = int(prefix)
		except ValueError:
			log.warning("Invalid prefix: %s" % prefix)
			return

		# Fix prefix length for IPv4
		if type == "ipv4":
			prefix = 32 - int(math.log(prefix, 2))

		# Try to parse the address
		try:
			network = ipaddress.ip_network("%s/%s" % (address, prefix), strict=False)
		except ValueError:
			log.warning("Invalid IP address: %s" % address)
			return

		if not self._check_parsed_network(network):
			return

		self.db.execute("""
			INSERT INTO
				networks
			(
				network,
				country,
				original_countries,
				source
			)
			VALUES
			(
				%s, %s, %s, %s
			)
			ON CONFLICT (network)
				DO UPDATE SET country = excluded.country
			""", "%s" % network, country_code, [country_code], source_key,
		)

	def _import_as_names_from_arin(self, downloader):
		# Delete all previously imported content
		self.db.execute("DELETE FROM autnums WHERE source = %s", "ARIN")

		# Try to retrieve the feed from ftp.arin.net
		feed = downloader.request_lines("https://ftp.arin.net/pub/resource_registry_service/asns.csv")
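
		# The CSV file is expected to carry quoted columns such as
		# "Start AS Number", "End AS Number", "Org Name" and "Status",
		# which are the keys read below. Illustrative row (not taken from
		# the live file):
		#
		#   "12345","12345","Example Org","Full Registry Services"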

		# Walk through the file
		for line in csv.DictReader(feed, dialect="arin"):
			log.debug("Processing object: %s" % line)

			# Fetch status
			status = line.get("Status")

			# We are only interested in anything managed by ARIN
			if not status == "Full Registry Services":
				continue

			# Fetch organization name
			name = line.get("Org Name")

			# Extract ASNs
			first_asn = line.get("Start AS Number")
			last_asn = line.get("End AS Number")

			# Cast to a number
			try:
				first_asn = int(first_asn)
			except (TypeError, ValueError):
				log.warning("Could not parse ASN '%s'" % first_asn)
				continue

			try:
				last_asn = int(last_asn)
			except (TypeError, ValueError):
				log.warning("Could not parse ASN '%s'" % last_asn)
				continue

			# Check if the range is valid
			if last_asn < first_asn:
				log.warning("Invalid ASN range %s-%s" % (first_asn, last_asn))
				continue

			# Insert everything into the database
			for asn in range(first_asn, last_asn + 1):
				if not self._check_parsed_asn(asn):
					log.warning("Skipping invalid ASN %s" % asn)
					continue

				self.db.execute("""
					INSERT INTO
						autnums
					(
						number,
						name,
						source
					)
					VALUES
					(
						%s, %s, %s
					)
					ON CONFLICT
					(
						number
					)
					DO NOTHING
					""", asn, name, "ARIN",
				)

	def handle_update_announcements(self, ns):
		server = ns.server[0]

		with self.db.transaction():
			if server.startswith("/"):
				self._handle_update_announcements_from_bird(server)

			# Purge anything we never want here
			self.db.execute("""
				-- Delete default routes
				DELETE FROM announcements WHERE network = '::/0' OR network = '0.0.0.0/0';

				-- Delete anything that is not global unicast address space
				DELETE FROM announcements WHERE family(network) = 6 AND NOT network <<= '2000::/3';

				-- DELETE "current network" address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '0.0.0.0/8';

				-- DELETE local loopback address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '127.0.0.0/8';

				-- DELETE RFC 1918 address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '10.0.0.0/8';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '172.16.0.0/12';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.168.0.0/16';

				-- DELETE test, benchmark and documentation address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.0.0/24';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.2.0/24';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.18.0.0/15';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.51.100.0/24';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '203.0.113.0/24';

				-- DELETE CGNAT address space (RFC 6598)
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '100.64.0.0/10';

				-- DELETE link local address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '169.254.0.0/16';

				-- DELETE IPv6 to IPv4 (6to4) address space (RFC 3068)
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.88.99.0/24';
				DELETE FROM announcements WHERE family(network) = 6 AND network <<= '2002::/16';

				-- DELETE multicast and reserved address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '224.0.0.0/4';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '240.0.0.0/4';

				-- Delete networks that are too small to be in the global routing table
				DELETE FROM announcements WHERE family(network) = 6 AND masklen(network) > 48;
				DELETE FROM announcements WHERE family(network) = 4 AND masklen(network) > 24;

				-- Delete any non-public or reserved ASNs
				DELETE FROM announcements WHERE NOT (
					(autnum >= 1 AND autnum <= 23455)
				OR
					(autnum >= 23457 AND autnum <= 64495)
				OR
					(autnum >= 131072 AND autnum <= 4199999999)
				);

				-- Delete everything that we have not seen for 14 days
				DELETE FROM announcements WHERE last_seen_at <= CURRENT_TIMESTAMP - INTERVAL '14 days';
			""")

	def _handle_update_announcements_from_bird(self, server):
		# Pre-compile the regular expression for faster searching
		route = re.compile(rb"^\s(.+?)\s+.+?\[(?:AS(.*?))?.\]$")

		log.info("Requesting routing table from Bird (%s)" % server)

		aggregated_networks = []

		# Send command to list all routes
		for line in self._bird_cmd(server, "show route"):
			m = route.match(line)
			if not m:
				# Skip empty lines
				if not line:
					pass

				# Ignore any header lines with the name of the routing table
				elif line.startswith(b"Table"):
					pass

				# Log anything else
				else:
					log.debug("Could not parse line: %s" % line.decode())

				continue

			# Fetch the extracted network and ASN
			network, autnum = m.groups()

			# Decode into strings
			if network:
				network = network.decode()
			if autnum:
				autnum = autnum.decode()

			# Collect all aggregated networks
			if not autnum:
				log.debug("%s is an aggregated network" % network)
				aggregated_networks.append(network)
				continue

			# Insert it into the database
			self.db.execute("INSERT INTO announcements(network, autnum) \
				VALUES(%s, %s) ON CONFLICT (network) DO \
				UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
				network, autnum,
			)

		# Process any aggregated networks
		for network in aggregated_networks:
			log.debug("Processing aggregated network %s" % network)

			# Run "show route all" for each network
			for line in self._bird_cmd(server, "show route %s all" % network):
				# Try finding the path
				m = re.match(rb"\s+BGP\.as_path:.* (\d+) {\d+}$", line)
				if m:
					# Select the last AS number in the path
					autnum = m.group(1).decode()

					# Insert it into the database
					self.db.execute("INSERT INTO announcements(network, autnum) \
						VALUES(%s, %s) ON CONFLICT (network) DO \
						UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
						network, autnum,
					)

					# We don't need to process any more
					break

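	# A note on the BIRD control protocol spoken below: replies consist of
	# lines that start with a four-digit status code followed by either a
	# space (final line) or a hyphen (continuation line), e.g. (illustrative):
	#
	#   0001 BIRD 2.0 ready.
	#
	# Code 0 terminates the output of a command and code 1 is the greeting,
	# which is what the checks on line[4] and the code below implement.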
	def _bird_cmd(self, socket_path, command):
		# Connect to the socket
		s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
		s.connect(socket_path)

		# Allocate some buffer
		buffer = b""

		log.debug("Sending Bird command: %s" % command)

		# Send the command
		s.send(b"%s\n" % command.encode())

		while True:
			# Fill up the buffer
			buffer += s.recv(4096)

			while True:
				# Search for the next newline
				pos = buffer.find(b"\n")

				# If we cannot find one, we go back and read more data
				if pos <= 0:
					break

				# Cut after the newline character
				pos += 1

				# Split the line we want and keep the rest in buffer
				line, buffer = buffer[:pos], buffer[pos:]

				# Try parsing any status lines
				if len(line) > 4 and line[:4].isdigit() and line[4] in (32, 45):
					code, delim, line = int(line[:4]), line[4], line[5:]

					log.debug("Received response code %s from bird" % code)

					# End of output
					if code == 0:
						return

					# Ignore hello line
					elif code == 1:
						continue

				# Otherwise return the line
				yield line

	def handle_update_geofeeds(self, ns):
		# Sync geofeeds
		with self.db.transaction():
			# Delete all geofeeds which are no longer linked
			self.db.execute("""
				DELETE FROM
					geofeeds
				WHERE
					NOT EXISTS (
						SELECT
							1
						FROM
							network_geofeeds
						WHERE
							geofeeds.url = network_geofeeds.url
					)""",
			)

			# Copy all geofeeds
			self.db.execute("""
				INSERT INTO
					geofeeds(
						url
					)
				SELECT
					url
				FROM
					network_geofeeds
				ON CONFLICT (url)
					DO NOTHING
				""",
			)

		# Fetch all Geofeeds that require an update
		geofeeds = self.db.query("""
			SELECT
				id,
				url
			FROM
				geofeeds
			WHERE
				updated_at IS NULL
			OR
				updated_at <= CURRENT_TIMESTAMP - INTERVAL '1 week'
			ORDER BY
				id
			""")

		# Create a downloader
		downloader = location.importer.Downloader()

		# Pass the downloader to the fetch_geofeed function
		fetch_geofeed = functools.partial(self._fetch_geofeed, downloader)

		with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
			results = executor.map(fetch_geofeed, geofeeds)

			# Fetch all results to raise any exceptions
			for result in results:
				pass

		# Delete data from any feeds that did not update in the last two weeks
		with self.db.transaction():
			self.db.execute("""
				DELETE FROM
					geofeed_networks
				WHERE
					geofeed_networks.geofeed_id IN (
						SELECT
							geofeeds.id
						FROM
							geofeeds
						WHERE
							updated_at IS NULL
						OR
							updated_at <= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
					)
				""")

	def _fetch_geofeed(self, downloader, geofeed):
		log.debug("Fetching Geofeed %s" % geofeed.url)

		with self.db.transaction():
			# Open the URL
			try:
				# Send the request
				f = downloader.retrieve(geofeed.url, headers={
					"User-Agent" : "location/%s" % location.__version__,

					# We expect some plain text file in CSV format
					"Accept" : "text/csv, text/plain",
				})

				# Remove any previous data
				self.db.execute("DELETE FROM geofeed_networks \
					WHERE geofeed_id = %s", geofeed.id)

				lineno = 0
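
				# Geofeed files are CSV as described in RFC 8805, one network
				# per line, e.g. (illustrative):
				#
				#   192.0.2.0/24,US,US-CA,Los Angeles,
				#
				# i.e. prefix, country, region, city and an optional postal
				# code, which is deliberately not used below.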

				# Read the output line by line
				for line in f:
					lineno += 1

					try:
						line = line.decode()

					# Ignore any lines we cannot decode
					except UnicodeDecodeError:
						log.debug("Could not decode line %s in %s" \
							% (lineno, geofeed.url))
						continue

					# Strip any newline
					line = line.rstrip()

					# Skip empty lines
					if not line:
						continue

					# Try to parse the line
					try:
						fields = line.split(",", 5)
					except ValueError:
						log.debug("Could not parse line: %s" % line)
						continue

					# Check if we have enough fields
					if len(fields) < 4:
						log.debug("Not enough fields in line: %s" % line)
						continue

					# Fetch all fields
					network, country, region, city = fields[:4]

					# Try to parse the network
					try:
						network = ipaddress.ip_network(network, strict=False)
					except ValueError:
						log.debug("Could not parse network: %s" % network)
						continue

					# Strip any excess whitespace from country codes
					country = country.strip()

					# Make the country code uppercase
					country = country.upper()

					# Check the country code
					if not country:
						log.debug("Empty country code in Geofeed %s line %s" \
							% (geofeed.url, lineno))
						continue

					elif not location.country_code_is_valid(country):
						log.debug("Invalid country code in Geofeed %s:%s: %s" \
							% (geofeed.url, lineno, country))
						continue

					# Write this into the database
					self.db.execute("""
						INSERT INTO
							geofeed_networks (
								geofeed_id,
								network,
								country,
								region,
								city
							)
						VALUES (%s, %s, %s, %s, %s)""",
						geofeed.id,
						"%s" % network,
						country,
						region,
						city,
					)

			# Catch any HTTP errors
			except urllib.request.HTTPError as e:
				self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
					WHERE id = %s", e.code, "%s" % e, geofeed.id)

				# Remove any previous data when the feed has been deleted
				if e.code == 404:
					self.db.execute("DELETE FROM geofeed_networks \
						WHERE geofeed_id = %s", geofeed.id)

			# Catch any other errors and connection timeouts
			except (http.client.InvalidURL, urllib.request.URLError, TimeoutError) as e:
				log.debug("Could not fetch URL %s: %s" % (geofeed.url, e))

				self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
					WHERE id = %s", 599, "%s" % e, geofeed.id)

			# Mark the geofeed as updated
			else:
				self.db.execute("""
					UPDATE
						geofeeds
					SET
						updated_at = CURRENT_TIMESTAMP,
						status = NULL,
						error = NULL
					WHERE
						id = %s""",
					geofeed.id,
				)

	def handle_update_overrides(self, ns):
		with self.db.transaction():
			# Drop any previous content
			self.db.execute("TRUNCATE TABLE autnum_overrides")
			self.db.execute("TRUNCATE TABLE network_overrides")

			for file in ns.files:
				log.info("Reading %s..." % file)

				with open(file, "rb") as f:
					for type, block in location.importer.read_blocks(f):
						if type == "net":
							network = block.get("net")

							# Try to parse and normalise the network
							try:
								network = ipaddress.ip_network(network, strict=False)
							except ValueError as e:
								log.warning("Invalid IP network: %s: %s" % (network, e))
								continue

							# Prevent that we overwrite all networks
							if network.prefixlen == 0:
								log.warning("Skipping %s: You cannot overwrite default" % network)
								continue

							self.db.execute("""
								INSERT INTO
									network_overrides
								(
									network,
									country,
									is_anonymous_proxy,
									is_satellite_provider,
									is_anycast,
									is_drop
								)
								VALUES
								(
									%s, %s, %s, %s, %s, %s
								)
								ON CONFLICT (network) DO NOTHING
								""",
								"%s" % network,
								block.get("country"),
								self._parse_bool(block, "is-anonymous-proxy"),
								self._parse_bool(block, "is-satellite-provider"),
								self._parse_bool(block, "is-anycast"),
								self._parse_bool(block, "drop"),
							)

						elif type == "aut-num":
							autnum = block.get("aut-num")

							# Check if AS number begins with "AS"
							if not autnum.startswith("AS"):
								log.warning("Invalid AS number: %s" % autnum)
								continue

							# Strip "AS"
							autnum = autnum[2:]

							self.db.execute("""
								INSERT INTO
									autnum_overrides
								(
									number,
									name,
									country,
									is_anonymous_proxy,
									is_satellite_provider,
									is_anycast,
									is_drop
								)
								VALUES
								(
									%s, %s, %s, %s, %s, %s, %s
								)
								ON CONFLICT (number) DO NOTHING
								""",
								autnum,
								block.get("name"),
								block.get("country"),
								self._parse_bool(block, "is-anonymous-proxy"),
								self._parse_bool(block, "is-satellite-provider"),
								self._parse_bool(block, "is-anycast"),
								self._parse_bool(block, "drop"),
							)

						else:
							log.warning("Unsupported type: %s" % type)

	def handle_update_feeds(self, ns):
		"""
			Update any third-party feeds
		"""
		success = True

		# Create a downloader
		downloader = location.importer.Downloader()

		feeds = (
			# AWS IP Ranges
			("AWS-IP-RANGES", self._import_aws_ip_ranges, "https://ip-ranges.amazonaws.com/ip-ranges.json"),

			# Spamhaus DROP
			("SPAMHAUS-DROP", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/drop.txt"),
			("SPAMHAUS-EDROP", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/edrop.txt"),
			("SPAMHAUS-DROPV6", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/dropv6.txt"),

			# Spamhaus ASNDROP
			("SPAMHAUS-ASNDROP", self._import_spamhaus_asndrop, "https://www.spamhaus.org/drop/asndrop.json"),
		)

		# Drop any data from feeds that we don't support (any more)
		with self.db.transaction():
			# Fetch the names of all feeds we support
			sources = [name for name, *rest in feeds]

			self.db.execute("DELETE FROM autnum_feeds WHERE NOT source = ANY(%s)", sources)
			self.db.execute("DELETE FROM network_feeds WHERE NOT source = ANY(%s)", sources)

		# Walk through all feeds
		for name, callback, url, *args in feeds:
			# Skip any feeds that were not requested on the command line
			if ns.feeds and not name in ns.feeds:
				continue

			try:
				self._process_feed(downloader, name, callback, url, *args)

			# Log an error but continue if an exception occurs
			except Exception as e:
				log.error("Error processing feed '%s': %s" % (name, e))
				success = False

		# Return status
		return 0 if success else 1

	def _process_feed(self, downloader, name, callback, url, *args):
		"""
			Processes one feed
		"""
		# Open the URL
		f = downloader.retrieve(url)

		with self.db.transaction():
			# Drop any previous content
			self.db.execute("DELETE FROM autnum_feeds WHERE source = %s", name)
			self.db.execute("DELETE FROM network_feeds WHERE source = %s", name)

			# Call the callback to process the feed
			return callback(name, f, *args)

	def _import_aws_ip_ranges(self, name, f):
		# Parse the feed
		feed = json.load(f)
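
		# For reference, ip-ranges.json contains entries along these lines
		# (abridged, illustrative):
		#
		#   {
		#     "prefixes": [
		#       { "ip_prefix": "3.5.140.0/22", "region": "ap-northeast-2", ... }
		#     ],
		#     "ipv6_prefixes": [
		#       { "ipv6_prefix": "2600:1f14::/35", "region": "us-west-2", ... }
		#     ]
		#   }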

		# Set up a dictionary for mapping a region name to a country. Unfortunately,
		# there seems to be no machine-readable version available of this other than
		# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html
		# (worse, it seems to be incomplete :-/ ); https://www.cloudping.cloud/endpoints
		# was helpful here as well.
		aws_region_country_map = {
			# Africa
			"af-south-1" : "ZA",

			# Asia
			"il-central-1" : "IL", # Tel Aviv

			# Asia/Pacific
			"ap-northeast-1" : "JP",
			"ap-northeast-2" : "KR",
			"ap-northeast-3" : "JP",
			"ap-east-1" : "HK",
			"ap-south-1" : "IN",
			"ap-south-2" : "IN",
			"ap-southeast-1" : "SG",
			"ap-southeast-2" : "AU",
			"ap-southeast-3" : "MY",
			"ap-southeast-4" : "AU",
			"ap-southeast-5" : "NZ", # Auckland, NZ
			"ap-southeast-6" : "AP", # XXX: Precise location not documented anywhere

			# Canada
			"ca-central-1" : "CA",
			"ca-west-1" : "CA",

			# Europe
			"eu-central-1" : "DE",
			"eu-central-2" : "CH",
			"eu-north-1" : "SE",
			"eu-west-1" : "IE",
			"eu-west-2" : "GB",
			"eu-west-3" : "FR",
			"eu-south-1" : "IT",
			"eu-south-2" : "ES",

			# Middle East
			"me-central-1" : "AE",
			"me-south-1" : "BH",

			# South America
			"sa-east-1" : "BR",

			# Undocumented, likely located in Berlin rather than Frankfurt
			"eusc-de-east-1" : "DE",
		}

		# Collect a list of all networks
		prefixes = feed.get("ipv6_prefixes", []) + feed.get("prefixes", [])

		for prefix in prefixes:
			# Fetch network
			network = prefix.get("ipv6_prefix") or prefix.get("ip_prefix")

			# Parse the network
			try:
				network = ipaddress.ip_network(network)
			except ValueError as e:
				log.warning("%s: Unable to parse prefix %s" % (name, network))
				continue

			# Sanitize parsed networks...
			if not self._check_parsed_network(network):
				continue

			# Fetch the region
			region = prefix.get("region")

			# Set some defaults
			cc = None
			is_anycast = False

			# Fetch the CC from the dictionary
			try:
				cc = aws_region_country_map[region]

			# If we couldn't find anything, let's try something else...
			except KeyError as e:
				# Find anycast networks
				if region == "GLOBAL":
					is_anycast = True

				# Everything that starts with us- is probably in the United States
				elif region.startswith("us-"):
					cc = "US"

				# Everything that starts with cn- is probably China
				elif region.startswith("cn-"):
					cc = "CN"

				# Log a warning for anything else
				else:
					log.warning("%s: Could not determine country code for AWS region %s" \
						% (name, region))
					continue

			# Write to database
			self.db.execute("""
				INSERT INTO
					network_feeds
				(
					network,
					source,
					country,
					is_anycast
				)
				VALUES
				(
					%s, %s, %s, %s
				)
				ON CONFLICT (network, source) DO NOTHING
				""", "%s" % network, name, cc, is_anycast,
			)

	def _import_spamhaus_drop(self, name, f):
		"""
			Import Spamhaus DROP IP feeds
		"""
		# Count all lines
		lines = 0

		# Walk through all lines
		for line in f:
			# Decode line
			line = line.decode("utf-8")

			# Strip off any comments
			line, _, comment = line.partition(";")

			# Ignore empty lines
			if not line:
				continue

			# Strip any excess whitespace
			line = line.strip()

			# Increment line counter
			lines += 1

			# Parse the network
			try:
				network = ipaddress.ip_network(line)
			except ValueError as e:
				log.warning("%s: Could not parse network: %s - %s" % (name, line, e))
				continue

			# Check network
			if not self._check_parsed_network(network):
				log.warning("%s: Skipping bogus network: %s" % (name, network))
				continue

			# Insert into the database
			self.db.execute("""
				INSERT INTO
					network_feeds
				(
					network,
					source,
					is_drop
				)
				VALUES
				(
					%s, %s, %s
				)""", "%s" % network, name, True,
			)

		# Raise an exception if we could not import anything
		if not lines:
			raise RuntimeError("Received bogus feed %s with no data" % name)

	def _import_spamhaus_asndrop(self, name, f):
		"""
			Import Spamhaus ASNDROP feed
		"""
		for line in f:
			# Decode the line
			line = line.decode("utf-8")

			# Parse JSON
			try:
				line = json.loads(line)
			except json.JSONDecodeError as e:
				log.warning("%s: Unable to parse JSON object %s: %s" % (name, line, e))
				continue

			# Fetch type
			type = line.get("type")

			# Skip any metadata
			if type == "metadata":
				continue

			# Fetch ASN
			asn = line.get("asn")

			# Skip any lines without an ASN
			if not asn:
				continue

			# Filter invalid ASNs
			if not self._check_parsed_asn(asn):
				log.warning("%s: Skipping bogus ASN %s" % (name, asn))
				continue

			# Write to database
			self.db.execute("""
				INSERT INTO
					autnum_feeds
				(
					number,
					source,
					is_drop
				)
				VALUES
				(
					%s, %s, %s
				)""", "%s" % asn, name, True,
			)

	@staticmethod
	def _parse_bool(block, key):
		val = block.get(key)

		# There is no point in proceeding when we got None
		if val is None:
			return

		# Convert to lowercase
		val = val.lower()

		# True
		if val in ("yes", "1"):
			return True

		# False
		if val in ("no", "0"):
			return False

		# Default to None
		return None

	def handle_import_countries(self, ns):
		with self.db.transaction():
			# Drop all data that we have
			self.db.execute("TRUNCATE TABLE countries")

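			# Each line of the input file is expected to carry a country code,
			# a continent code and the country name, separated by whitespace,
			# e.g. (illustrative): DE  EU  Germany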
			for file in ns.file:
				for line in file:
					line = line.rstrip()

					# Ignore any comments
					if line.startswith("#"):
						continue

					try:
						country_code, continent_code, name = line.split(maxsplit=2)
					except ValueError:
						log.warning("Could not parse line: %s" % line)
						continue

					self.db.execute("INSERT INTO countries(country_code, name, continent_code) \
						VALUES(%s, %s, %s) ON CONFLICT DO NOTHING", country_code, name, continent_code)


def split_line(line):
	key, colon, val = line.partition(":")

	# Strip any excess space
	key = key.strip()
	val = val.strip()

	return key, val

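# Example invocation (illustrative; the actual script name is templated in at
# build time):
#
#   location-importer \
#       --database-host localhost --database-name location \
#       --database-username location --database-password secret \
#       update-whois
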
def main():
	# Run the command line interface
	c = CLI()
	c.run()

main()