#!/usr/bin/python3
###############################################################################
#                                                                             #
# libloc - A library to determine the location of someone on the Internet    #
#                                                                             #
# Copyright (C) 2020-2024 IPFire Development Team <info@ipfire.org>           #
#                                                                             #
# This library is free software; you can redistribute it and/or              #
# modify it under the terms of the GNU Lesser General Public                 #
# License as published by the Free Software Foundation; either               #
# version 2.1 of the License, or (at your option) any later version.         #
#                                                                             #
# This library is distributed in the hope that it will be useful,            #
# but WITHOUT ANY WARRANTY; without even the implied warranty of             #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU           #
# Lesser General Public License for more details.                            #
#                                                                             #
###############################################################################

import argparse
import concurrent.futures
import csv
import http.client
import ipaddress
import json
import logging
import math
import re
import socket
import sys
import urllib.error
import urllib.parse
import urllib.request

# Load our location module
import location
import location.database
import location.importer
from location.i18n import _

# Initialise logging
log = logging.getLogger("location.importer")
log.propagate = 1

# Define constants
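# Ranges of publicly assignable ASNs; AS_TRANS (23456) as well as the
# reserved, private and documentation ranges in between and above are
# deliberately excluded.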
VALID_ASN_RANGES = (
    (1, 23455),
    (23457, 64495),
    (131072, 4199999999),
)

# Configure the CSV parser for ARIN
csv.register_dialect("arin", delimiter=",", quoting=csv.QUOTE_ALL, quotechar="\"")

class CLI(object):
    def parse_cli(self):
        parser = argparse.ArgumentParser(
            description=_("Location Importer Command Line Interface"),
        )
        subparsers = parser.add_subparsers()

        # Global configuration flags
        parser.add_argument("--debug", action="store_true",
            help=_("Enable debug output"))
        parser.add_argument("--quiet", action="store_true",
            help=_("Enable quiet mode"))

        # version
        parser.add_argument("--version", action="version",
            version="%(prog)s @VERSION@")

        # Database
        parser.add_argument("--database-host", required=True,
            help=_("Database Hostname"), metavar=_("HOST"))
        parser.add_argument("--database-name", required=True,
            help=_("Database Name"), metavar=_("NAME"))
        parser.add_argument("--database-username", required=True,
            help=_("Database Username"), metavar=_("USERNAME"))
        parser.add_argument("--database-password", required=True,
            help=_("Database Password"), metavar=_("PASSWORD"))

        # Write Database
        write = subparsers.add_parser("write", help=_("Write database to file"))
        write.set_defaults(func=self.handle_write)
        write.add_argument("file", nargs=1, help=_("Database File"))
        write.add_argument("--signing-key", nargs="?", type=open, help=_("Signing Key"))
        write.add_argument("--backup-signing-key", nargs="?", type=open, help=_("Backup Signing Key"))
        write.add_argument("--vendor", nargs="?", help=_("Sets the vendor"))
        write.add_argument("--description", nargs="?", help=_("Sets a description"))
        write.add_argument("--license", nargs="?", help=_("Sets the license"))
        write.add_argument("--version", type=int, help=_("Database Format Version"))

        # Update WHOIS
        update_whois = subparsers.add_parser("update-whois", help=_("Update WHOIS Information"))
        update_whois.set_defaults(func=self.handle_update_whois)

        # Update announcements
        update_announcements = subparsers.add_parser("update-announcements",
            help=_("Update BGP Announcements"))
        update_announcements.set_defaults(func=self.handle_update_announcements)
        update_announcements.add_argument("server", nargs=1,
            help=_("Route Server to connect to"), metavar=_("SERVER"))

        # Update geofeeds
        update_geofeeds = subparsers.add_parser("update-geofeeds",
            help=_("Update Geofeeds"))
        update_geofeeds.set_defaults(func=self.handle_update_geofeeds)

        # Update feeds
        update_feeds = subparsers.add_parser("update-feeds",
            help=_("Update Feeds"))
        update_feeds.set_defaults(func=self.handle_update_feeds)

        # Update overrides
        update_overrides = subparsers.add_parser("update-overrides",
            help=_("Update overrides"),
        )
        update_overrides.add_argument(
            "files", nargs="+", help=_("Files to import"),
        )
        update_overrides.set_defaults(func=self.handle_update_overrides)

        # Import countries
        import_countries = subparsers.add_parser("import-countries",
            help=_("Import countries"),
        )
        import_countries.add_argument("file", nargs=1, type=argparse.FileType("r"),
            help=_("File to import"))
        import_countries.set_defaults(func=self.handle_import_countries)

        args = parser.parse_args()

        # Configure logging
        if args.debug:
            location.logger.set_level(logging.DEBUG)
        elif args.quiet:
            location.logger.set_level(logging.WARNING)

        # Print usage if no action was given
        if "func" not in args:
            parser.print_usage()
            sys.exit(2)

        return args

    def run(self):
        # Parse command line arguments
        args = self.parse_cli()

        # Initialise database
        self.db = self._setup_database(args)

        # Call function
        ret = args.func(args)

        # Return with exit code
        if ret:
            sys.exit(ret)

        # Otherwise just exit
        sys.exit(0)

    def _setup_database(self, ns):
        """
            Initialise the database
        """
        # Connect to database
        db = location.database.Connection(
            host=ns.database_host, database=ns.database_name,
            user=ns.database_username, password=ns.database_password,
        )

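        # Create or upgrade the schema in place: every statement below is
        # idempotent (CREATE ... IF NOT EXISTS, ADD COLUMN IF NOT EXISTS),
        # so it is safe to run the importer against an existing database.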
        with db.transaction():
            db.execute("""
                -- announcements
                CREATE TABLE IF NOT EXISTS announcements(network inet, autnum bigint,
                    first_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
                    last_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP);
                CREATE UNIQUE INDEX IF NOT EXISTS announcements_networks ON announcements(network);
                CREATE INDEX IF NOT EXISTS announcements_family ON announcements(family(network));
                CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);

                -- autnums
                CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
                ALTER TABLE autnums ADD COLUMN IF NOT EXISTS source text;
                CREATE UNIQUE INDEX IF NOT EXISTS autnums_number ON autnums(number);

                -- countries
                CREATE TABLE IF NOT EXISTS countries(
                    country_code text NOT NULL, name text NOT NULL, continent_code text NOT NULL);
                CREATE UNIQUE INDEX IF NOT EXISTS countries_country_code ON countries(country_code);

                -- networks
                CREATE TABLE IF NOT EXISTS networks(network inet, country text);
                ALTER TABLE networks ADD COLUMN IF NOT EXISTS original_countries text[];
                ALTER TABLE networks ADD COLUMN IF NOT EXISTS source text;
                CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
                CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
                CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);

                -- geofeeds
                CREATE TABLE IF NOT EXISTS geofeeds(
                    id serial primary key,
                    url text,
                    status integer default null,
                    updated_at timestamp without time zone default null
                );
                ALTER TABLE geofeeds ADD COLUMN IF NOT EXISTS error text;
                CREATE UNIQUE INDEX IF NOT EXISTS geofeeds_unique
                    ON geofeeds(url);
                CREATE TABLE IF NOT EXISTS geofeed_networks(
                    geofeed_id integer references geofeeds(id) on delete cascade,
                    network inet,
                    country text,
                    region text,
                    city text
                );
                CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
                    ON geofeed_networks(geofeed_id);
                CREATE INDEX IF NOT EXISTS geofeed_networks_search
                    ON geofeed_networks USING GIST(network inet_ops);
                CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
                CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
                    ON network_geofeeds(network);
                CREATE INDEX IF NOT EXISTS network_geofeeds_search
                    ON network_geofeeds USING GIST(network inet_ops);
                CREATE INDEX IF NOT EXISTS network_geofeeds_url
                    ON network_geofeeds(url);

                -- feeds
                CREATE TABLE IF NOT EXISTS autnum_feeds(
                    number bigint NOT NULL,
                    source text NOT NULL,
                    name text,
                    country text,
                    is_anonymous_proxy boolean,
                    is_satellite_provider boolean,
                    is_anycast boolean,
                    is_drop boolean
                );
                CREATE UNIQUE INDEX IF NOT EXISTS autnum_feeds_unique
                    ON autnum_feeds(number, source);

                CREATE TABLE IF NOT EXISTS network_feeds(
                    network inet NOT NULL,
                    source text NOT NULL,
                    country text,
                    is_anonymous_proxy boolean,
                    is_satellite_provider boolean,
                    is_anycast boolean,
                    is_drop boolean
                );
                CREATE UNIQUE INDEX IF NOT EXISTS network_feeds_unique
                    ON network_feeds(network, source);
                CREATE INDEX IF NOT EXISTS network_feeds_search
                    ON network_feeds USING GIST(network inet_ops);

                -- overrides
                CREATE TABLE IF NOT EXISTS autnum_overrides(
                    number bigint NOT NULL,
                    name text,
                    country text,
                    is_anonymous_proxy boolean,
                    is_satellite_provider boolean,
                    is_anycast boolean
                );
                CREATE UNIQUE INDEX IF NOT EXISTS autnum_overrides_number
                    ON autnum_overrides(number);
                ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS source text;
                ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;

                CREATE TABLE IF NOT EXISTS network_overrides(
                    network inet NOT NULL,
                    country text,
                    is_anonymous_proxy boolean,
                    is_satellite_provider boolean,
                    is_anycast boolean
                );
                CREATE UNIQUE INDEX IF NOT EXISTS network_overrides_network
                    ON network_overrides(network);
                CREATE INDEX IF NOT EXISTS network_overrides_search
                    ON network_overrides USING GIST(network inet_ops);
                ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS source text;
                ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
            """)

        return db

    def handle_write(self, ns):
        """
            Compiles a database in libloc format out of what is in the database
        """
        # Allocate a writer
        writer = location.Writer(ns.signing_key, ns.backup_signing_key)

        # Set all metadata
        if ns.vendor:
            writer.vendor = ns.vendor

        if ns.description:
            writer.description = ns.description

        if ns.license:
            writer.license = ns.license

        # Add all Autonomous Systems
        log.info("Writing Autonomous Systems...")

        # Select all ASes with a name
        rows = self.db.query("""
            SELECT
                autnums.number AS number,
                COALESCE(
                    overrides.name,
                    autnums.name
                ) AS name
            FROM
                autnums
            LEFT JOIN
                autnum_overrides overrides ON autnums.number = overrides.number
            ORDER BY
                autnums.number
            """)

        for row in rows:
            # Skip AS without names
            if not row.name:
                continue

            a = writer.add_as(row.number)
            a.name = row.name

        # Add all networks
        log.info("Writing networks...")

        # Select all known networks
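        # The query below enriches each known network using COALESCE(), i.e.
        # the first source with an opinion wins. For the country that order is:
        # network overrides, AS overrides, network feeds, AS feeds, geofeeds,
        # and finally the RIR data itself; for the flags it is: network
        # overrides, network feeds, AS feeds, AS overrides, defaulting to
        # FALSE. Within each source, more specific prefixes take precedence.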
        rows = self.db.query("""
            WITH known_networks AS (
                SELECT network FROM announcements
                UNION
                SELECT network FROM networks
                UNION
                SELECT network FROM network_feeds
                UNION
                SELECT network FROM network_overrides
                UNION
                SELECT network FROM geofeed_networks
            ),

            ordered_networks AS (
                SELECT
                    known_networks.network AS network,
                    announcements.autnum AS autnum,
                    networks.country AS country,

                    -- Must be part of returned values for ORDER BY clause
                    masklen(announcements.network) AS sort_a,
                    masklen(networks.network) AS sort_b
                FROM
                    known_networks
                LEFT JOIN
                    announcements ON known_networks.network <<= announcements.network
                LEFT JOIN
                    networks ON known_networks.network <<= networks.network
                ORDER BY
                    known_networks.network,
                    sort_a DESC,
                    sort_b DESC
            )

            -- Return a list of those networks enriched with all
            -- other information that we store in the database
            SELECT
                DISTINCT ON (network)
                network,
                autnum,

                -- Country
                COALESCE(
                    (
                        SELECT
                            country
                        FROM
                            network_overrides overrides
                        WHERE
                            networks.network <<= overrides.network
                        ORDER BY
                            masklen(overrides.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            country
                        FROM
                            autnum_overrides overrides
                        WHERE
                            networks.autnum = overrides.number
                    ),
                    (
                        SELECT
                            country
                        FROM
                            network_feeds feeds
                        WHERE
                            networks.network <<= feeds.network
                        ORDER BY
                            masklen(feeds.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            country
                        FROM
                            autnum_feeds feeds
                        WHERE
                            networks.autnum = feeds.number
                        ORDER BY
                            source
                        LIMIT 1
                    ),
                    (
                        SELECT
                            geofeed_networks.country AS country
                        FROM
                            network_geofeeds

                        -- Join the data from the geofeeds
                        LEFT JOIN
                            geofeeds ON network_geofeeds.url = geofeeds.url
                        LEFT JOIN
                            geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id

                        -- Check whether we have a geofeed for this network
                        WHERE
                            networks.network <<= network_geofeeds.network
                        AND
                            networks.network <<= geofeed_networks.network

                        -- Filter for the best result
                        ORDER BY
                            masklen(geofeed_networks.network) DESC
                        LIMIT 1
                    ),
                    networks.country
                ) AS country,

                -- Flags
                COALESCE(
                    (
                        SELECT
                            is_anonymous_proxy
                        FROM
                            network_overrides overrides
                        WHERE
                            networks.network <<= overrides.network
                        ORDER BY
                            masklen(overrides.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_anonymous_proxy
                        FROM
                            network_feeds feeds
                        WHERE
                            networks.network <<= feeds.network
                        ORDER BY
                            masklen(feeds.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_anonymous_proxy
                        FROM
                            autnum_feeds feeds
                        WHERE
                            networks.autnum = feeds.number
                        ORDER BY
                            source
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_anonymous_proxy
                        FROM
                            autnum_overrides overrides
                        WHERE
                            networks.autnum = overrides.number
                    ),
                    FALSE
                ) AS is_anonymous_proxy,
                COALESCE(
                    (
                        SELECT
                            is_satellite_provider
                        FROM
                            network_overrides overrides
                        WHERE
                            networks.network <<= overrides.network
                        ORDER BY
                            masklen(overrides.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_satellite_provider
                        FROM
                            network_feeds feeds
                        WHERE
                            networks.network <<= feeds.network
                        ORDER BY
                            masklen(feeds.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_satellite_provider
                        FROM
                            autnum_feeds feeds
                        WHERE
                            networks.autnum = feeds.number
                        ORDER BY
                            source
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_satellite_provider
                        FROM
                            autnum_overrides overrides
                        WHERE
                            networks.autnum = overrides.number
                    ),
                    FALSE
                ) AS is_satellite_provider,
                COALESCE(
                    (
                        SELECT
                            is_anycast
                        FROM
                            network_overrides overrides
                        WHERE
                            networks.network <<= overrides.network
                        ORDER BY
                            masklen(overrides.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_anycast
                        FROM
                            network_feeds feeds
                        WHERE
                            networks.network <<= feeds.network
                        ORDER BY
                            masklen(feeds.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_anycast
                        FROM
                            autnum_feeds feeds
                        WHERE
                            networks.autnum = feeds.number
                        ORDER BY
                            source
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_anycast
                        FROM
                            autnum_overrides overrides
                        WHERE
                            networks.autnum = overrides.number
                    ),
                    FALSE
                ) AS is_anycast,
                COALESCE(
                    (
                        SELECT
                            is_drop
                        FROM
                            network_overrides overrides
                        WHERE
                            networks.network <<= overrides.network
                        ORDER BY
                            masklen(overrides.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_drop
                        FROM
                            network_feeds feeds
                        WHERE
                            networks.network <<= feeds.network
                        ORDER BY
                            masklen(feeds.network) DESC
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_drop
                        FROM
                            autnum_feeds feeds
                        WHERE
                            networks.autnum = feeds.number
                        ORDER BY
                            source
                        LIMIT 1
                    ),
                    (
                        SELECT
                            is_drop
                        FROM
                            autnum_overrides overrides
                        WHERE
                            networks.autnum = overrides.number
                    ),
                    FALSE
                ) AS is_drop
            FROM
                ordered_networks networks
            """)

        for row in rows:
            network = writer.add_network(row.network)

            # Save country
            if row.country:
                network.country_code = row.country

            # Save ASN
            if row.autnum:
                network.asn = row.autnum

            # Set flags
            if row.is_anonymous_proxy:
                network.set_flag(location.NETWORK_FLAG_ANONYMOUS_PROXY)

            if row.is_satellite_provider:
                network.set_flag(location.NETWORK_FLAG_SATELLITE_PROVIDER)

            if row.is_anycast:
                network.set_flag(location.NETWORK_FLAG_ANYCAST)

            if row.is_drop:
                network.set_flag(location.NETWORK_FLAG_DROP)

        # Add all countries
        log.info("Writing countries...")
        rows = self.db.query("SELECT * FROM countries ORDER BY country_code")

        for row in rows:
            c = writer.add_country(row.country_code)
            c.continent_code = row.continent_code
            c.name = row.name

        # Write everything to file
        log.info("Writing database to file...")
        for file in ns.file:
            writer.write(file)

    def handle_update_whois(self, ns):
        downloader = location.importer.Downloader()

        # Did we run successfully?
        error = False

        # Fetch all valid country codes to check parsed networks against
        validcountries = self.countries

        # Iterate over all potential sources
        for source in sorted(location.importer.SOURCES):
            with self.db.transaction():
                # Create some temporary tables to store parsed data
                self.db.execute("""
                    CREATE TEMPORARY TABLE _autnums(number integer NOT NULL,
                        organization text NOT NULL, source text NOT NULL) ON COMMIT DROP;
                    CREATE UNIQUE INDEX _autnums_number ON _autnums(number);

                    CREATE TEMPORARY TABLE _organizations(handle text NOT NULL,
                        name text NOT NULL, source text NOT NULL) ON COMMIT DROP;
                    CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);

                    CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL,
                        original_countries text[] NOT NULL, source text NOT NULL)
                        ON COMMIT DROP;
                    CREATE INDEX _rirdata_search ON _rirdata
                        USING BTREE(family(network), masklen(network));
                    CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
                """)

                # Remove all previously imported content
                self.db.execute("DELETE FROM autnums WHERE source = %s", source)
                self.db.execute("DELETE FROM networks WHERE source = %s", source)

                try:
                    # Fetch WHOIS sources
                    for url in location.importer.WHOIS_SOURCES.get(source, []):
                        for block in downloader.request_blocks(url):
                            self._parse_block(block, source, validcountries)

                    # Fetch extended sources
                    for url in location.importer.EXTENDED_SOURCES.get(source, []):
                        for line in downloader.request_lines(url):
                            self._parse_line(line, source, validcountries)
                except urllib.error.URLError as e:
                    log.error("Could not retrieve data from %s: %s" % (source, e))
                    error = True

                    # Continue with the next source
                    continue

                # Process all parsed networks from every RIR we happen to have access to,
                # insert the largest network chunks into the networks table immediately...
                families = self.db.query("""
                    SELECT DISTINCT
                        family(network) AS family
                    FROM
                        _rirdata
                    ORDER BY
                        family(network)
                    """,
                )

                for family in (row.family for row in families):
                    # Fetch the smallest mask length in our data set
                    smallest = self.db.get("""
                        SELECT
                            MIN(
                                masklen(network)
                            ) AS prefix
                        FROM
                            _rirdata
                        WHERE
                            family(network) = %s
                        """, family,
                    )

                    # Copy all networks
                    self.db.execute("""
                        INSERT INTO
                            networks
                        (
                            network,
                            country,
                            original_countries,
                            source
                        )
                        SELECT
                            network,
                            country,
                            original_countries,
                            source
                        FROM
                            _rirdata
                        WHERE
                            masklen(network) = %s
                        AND
                            family(network) = %s
                        ON CONFLICT DO
                            NOTHING""",
                        smallest.prefix,
                        family,
                    )

                    # ... determine any other prefixes for this network family, ...
                    prefixes = self.db.query("""
                        SELECT
                            DISTINCT masklen(network) AS prefix
                        FROM
                            _rirdata
                        WHERE
                            family(network) = %s
                        ORDER BY
                            masklen(network) ASC
                        OFFSET 1
                        """, family,
                    )

                    # ... and insert networks with this prefix in case they provide additional
                    # information (i.e. a subnet of a larger chunk with a different country)
                    for prefix in (row.prefix for row in prefixes):
                        self.db.execute("""
                            WITH candidates AS (
                                SELECT
                                    _rirdata.network,
                                    _rirdata.country,
                                    _rirdata.original_countries,
                                    _rirdata.source
                                FROM
                                    _rirdata
                                WHERE
                                    family(_rirdata.network) = %s
                                AND
                                    masklen(_rirdata.network) = %s
                            ),
                            filtered AS (
                                SELECT
                                    DISTINCT ON (c.network)
                                    c.network,
                                    c.country,
                                    c.original_countries,
                                    c.source,
                                    masklen(networks.network),
                                    networks.country AS parent_country
                                FROM
                                    candidates c
                                LEFT JOIN
                                    networks
                                ON
                                    c.network << networks.network
                                ORDER BY
                                    c.network,
                                    masklen(networks.network) DESC NULLS LAST
                            )
                            INSERT INTO
                                networks(network, country, original_countries, source)
                            SELECT
                                network,
                                country,
                                original_countries,
                                source
                            FROM
                                filtered
                            WHERE
                                parent_country IS NULL
                            OR
                                country <> parent_country
                            ON CONFLICT DO NOTHING
                            """, family, prefix,
                        )

                self.db.execute("""
                    INSERT INTO
                        autnums
                    (
                        number,
                        name,
                        source
                    )
                    SELECT
                        _autnums.number,
                        _organizations.name,
                        _organizations.source
                    FROM
                        _autnums
                    JOIN
                        _organizations ON _autnums.organization = _organizations.handle
                    ON CONFLICT
                    (
                        number
                    )
                    DO UPDATE
                        SET name = excluded.name
                    """,
                )

        # Download and import (technical) AS names from ARIN
        with self.db.transaction():
            self._import_as_names_from_arin(downloader)

        # Return a non-zero exit code for errors
        return 1 if error else 0

    def _check_parsed_network(self, network):
        """
            Assistive function to detect and subsequently sort out parsed
            networks from RIR data (both Whois and so-called "extended
            sources"), which...

            (a) are not globally routable (RFC 1918 space, et al.)
            (b) cover too large a chunk of the IP address space (prefix length
                is < 7 for IPv4 networks, and < 10 for IPv6)
            (c) have "0.0.0.0" or "::" as a network address
            (d) are too small to be publicly announced (we have decided not to
                process them at the moment, as they significantly enlarge our
                database without providing very helpful additional information)

            This unfortunately is necessary due to brain-dead clutter across
            various RIR databases, causing mismatches and eventually disruptions.

            We will return False in case a network is not suitable for adding
            it to our database, and True otherwise.
        """

        if not network or not isinstance(network, (ipaddress.IPv4Network, ipaddress.IPv6Network)):
            return False

        if not network.is_global:
            log.debug("Skipping non-globally routable network: %s" % network)
            return False

        if network.version == 4:
            if network.prefixlen < 7:
                log.debug("Skipping too big IP chunk: %s" % network)
                return False

            if network.prefixlen > 24:
                log.debug("Skipping network too small to be publicly announced: %s" % network)
                return False

            if str(network.network_address) == "0.0.0.0":
                log.debug("Skipping network based on 0.0.0.0: %s" % network)
                return False

        elif network.version == 6:
            if network.prefixlen < 10:
                log.debug("Skipping too big IP chunk: %s" % network)
                return False

            if network.prefixlen > 48:
                log.debug("Skipping network too small to be publicly announced: %s" % network)
                return False

            if str(network.network_address) == "::":
                log.debug("Skipping network based on '::': %s" % network)
                return False

        else:
            # This should not happen...
            log.warning("Skipping network of unknown family, this should not happen: %s" % network)
            return False

        # In case we have made it here, the network is considered to
        # be suitable for libloc consumption...
        return True

    def _check_parsed_asn(self, asn):
        """
            Assistive function to filter Autonomous System Numbers not being suitable
            for adding to our database. Returns False in such cases, and True otherwise.
        """

        for start, end in VALID_ASN_RANGES:
            if start <= asn <= end:
                return True

        log.info("Supplied ASN %s out of publicly routable ASN ranges" % asn)
        return False

    def _parse_block(self, block, source_key, validcountries=None):
        # Get first line to find out what type of block this is
        line = block[0]

        # aut-num
        if line.startswith("aut-num:"):
            return self._parse_autnum_block(block, source_key)

        # inetnum
        if line.startswith("inet6num:") or line.startswith("inetnum:"):
            return self._parse_inetnum_block(block, source_key, validcountries)

        # organisation
        elif line.startswith("organisation:"):
            return self._parse_org_block(block, source_key)

    def _parse_autnum_block(self, block, source_key):
        autnum = {}
        for line in block:
            # Split line
            key, val = split_line(line)

            if key == "aut-num":
                m = re.match(r"^(AS|as)(\d+)", val)
                if m:
                    autnum["asn"] = m.group(2)

            elif key == "org":
                autnum[key] = val.upper()

            elif key == "descr":
                # Save the first description line as well...
                if key not in autnum:
                    autnum[key] = val

        # Skip empty objects
        if not autnum or "asn" not in autnum:
            return

        # Insert a dummy organisation handle into our temporary organisations
        # table in case the AS does not have an organisation handle set, but
        # has a description (a quirk often observed in the APNIC area), so we
        # can later display at least some string for this AS.
        if "org" not in autnum:
            if "descr" in autnum:
                autnum["org"] = "LIBLOC-%s-ORGHANDLE" % autnum.get("asn")

                self.db.execute("INSERT INTO _organizations(handle, name, source) \
                    VALUES(%s, %s, %s) ON CONFLICT (handle) DO NOTHING",
                    autnum.get("org"), autnum.get("descr"), source_key,
                )
            else:
                log.warning("ASN %s neither has an organisation handle nor a description line set, omitting" % \
                    autnum.get("asn"))
                return

        # Insert into database
        self.db.execute("INSERT INTO _autnums(number, organization, source) \
            VALUES(%s, %s, %s) ON CONFLICT (number) DO UPDATE SET \
            organization = excluded.organization",
            autnum.get("asn"), autnum.get("org"), source_key,
        )

    def _parse_inetnum_block(self, block, source_key, validcountries=None):
        log.debug("Parsing inetnum block:")

        inetnum = {}
        for line in block:
            log.debug(line)

            # Split line
            key, val = split_line(line)

            # Filter any inetnum records which are only referring to IP space
            # not managed by that specific RIR...
            if key == "netname":
                if re.match(r"^(ERX-NETBLOCK|(AFRINIC|ARIN|LACNIC|RIPE)-CIDR-BLOCK|IANA-NETBLOCK-\d{1,3}|NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK|STUB-[\d-]{3,}SLASH\d{1,2})", val.strip()):
                    log.debug("Skipping record indicating historic/orphaned data: %s" % val.strip())
                    return

            if key == "inetnum":
                start_address, delim, end_address = val.partition("-")

                # Strip any excess space
                start_address, end_address = start_address.rstrip(), end_address.strip()

                # Handle "inetnum" formatting in LACNIC DB (e.g. "24.152.8/22" instead of "24.152.8.0/22")
                if start_address and not (delim or end_address):
                    try:
                        start_address = ipaddress.ip_network(start_address, strict=False)
                    except ValueError:
                        start_address = start_address.split("/")
                        ldigits = start_address[0].count(".")

                        # How many octets do we need to add?
                        # (LACNIC does not seem to have a /8 or greater assigned, so the following should suffice.)
                        if ldigits == 1:
                            start_address = start_address[0] + ".0.0/" + start_address[1]
                        elif ldigits == 2:
                            start_address = start_address[0] + ".0/" + start_address[1]
                        else:
                            log.warning("Could not recover IPv4 address from line in LACNIC DB format: %s" % line)
                            return

                        try:
                            start_address = ipaddress.ip_network(start_address, strict=False)
                        except ValueError:
                            log.warning("Could not parse line in LACNIC DB format: %s" % line)
                            return

                    # Enumerate first and last IP address of this network
                    end_address = start_address[-1]
                    start_address = start_address[0]

                else:
                    # Convert to IP address
                    try:
                        start_address = ipaddress.ip_address(start_address)
                        end_address = ipaddress.ip_address(end_address)
                    except ValueError:
                        log.warning("Could not parse line: %s" % line)
                        return

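                # An inetnum range does not have to align with CIDR boundaries,
                # so summarize_address_range() is used to break it down into
                # the smallest possible list of proper CIDR networks.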
                inetnum["inetnum"] = list(ipaddress.summarize_address_range(start_address, end_address))

            elif key == "inet6num":
                # Try to parse the network (invalid values crop up here, too)
                try:
                    inetnum[key] = [ipaddress.ip_network(val, strict=False)]
                except ValueError:
                    log.warning("Could not parse line: %s" % line)
                    return

            elif key == "country":
                val = val.upper()

                # Catch RIR data objects with more than one country code...
                if key not in inetnum:
                    inetnum[key] = []
                else:
                    if val in inetnum.get("country"):
                        # ... but keep this list distinct...
                        continue

                # When people set country codes to "UK", they actually mean "GB"
                if val == "UK":
                    val = "GB"

                inetnum[key].append(val)

            # Parse the geofeed attribute
            elif key == "geofeed":
                inetnum["geofeed"] = val

            # Parse geofeed when used as a remark
            elif key == "remarks":
                m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
                if m:
                    inetnum["geofeed"] = m.group(1)

        # Skip empty objects
        if not inetnum or "country" not in inetnum:
            return

        # Prepare skipping objects with unknown country codes...
        invalidcountries = [singlecountry for singlecountry in inetnum.get("country") if singlecountry not in validcountries]

        # Iterate through all networks enumerated from above, check them for plausibility and insert
        # them into the database, if _check_parsed_network() succeeded
        for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
            if self._check_parsed_network(single_network):
                # Skip objects with unknown country codes, but only if we were
                # given a list of valid ones (this avoids log spam)...
                if validcountries and invalidcountries:
                    log.warning("Skipping network with bogus countr(y|ies) %s (original countries: %s): %s" % \
                        (invalidcountries, inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
                    break

                # Everything is fine here, run INSERT statement...
                self.db.execute("INSERT INTO _rirdata(network, country, original_countries, source) \
                    VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
                    "%s" % single_network, inetnum.get("country")[0], inetnum.get("country"), source_key,
                )

                # Update any geofeed information
                geofeed = inetnum.get("geofeed", None)
                if geofeed:
                    self._parse_geofeed(geofeed, single_network)

                # Delete any previous geofeeds
                else:
                    self.db.execute("DELETE FROM network_geofeeds WHERE network = %s",
                        "%s" % single_network)

    def _parse_geofeed(self, url, single_network):
        # Parse the URL
        url = urllib.parse.urlparse(url)

        # Make sure that this is a HTTPS URL
        if url.scheme != "https":
            log.debug("Geofeed URL is not using HTTPS: %s" % url.geturl())
            return

        # Put the URL back together normalized
        url = url.geturl()

        # Store/update any geofeeds
        self.db.execute("""
            INSERT INTO
                network_geofeeds(
                    network,
                    url
                )
            VALUES(
                %s, %s
            )
            ON CONFLICT (network) DO
                UPDATE SET url = excluded.url""",
            "%s" % single_network, url,
        )

    def _parse_org_block(self, block, source_key):
        org = {}
        for line in block:
            # Split line
            key, val = split_line(line)

            if key == "organisation":
                org[key] = val.upper()
            elif key == "org-name":
                org[key] = val

        # Skip empty objects
        if not org:
            return

        self.db.execute("INSERT INTO _organizations(handle, name, source) \
            VALUES(%s, %s, %s) ON CONFLICT (handle) DO \
            UPDATE SET name = excluded.name",
            org.get("organisation"), org.get("org-name"), source_key,
        )

    def _parse_line(self, line, source_key, validcountries=None):
        # Skip version line
        if line.startswith("2"):
            return

        # Skip comments
        if line.startswith("#"):
            return

        try:
            registry, country_code, type, line = line.split("|", 3)
        except ValueError:
            log.warning("Could not parse line: %s" % line)
            return

        # Skip any lines that are for stats only or do not have a country
        # code at all (avoids log spam below)
        if not country_code or country_code == '*':
            return

        # Skip objects with unknown country codes
        if validcountries and country_code not in validcountries:
            log.warning("Skipping line with bogus country '%s': %s" % \
                (country_code, line))
            return

        if type in ("ipv6", "ipv4"):
            return self._parse_ip_line(country_code, type, line, source_key)

    def _parse_ip_line(self, country, type, line, source_key):
        try:
            address, prefix, date, status, organization = line.split("|")
        except ValueError:
            organization = None

            # Try parsing the line without organization
            try:
                address, prefix, date, status = line.split("|")
            except ValueError:
                log.warning("Unhandled line format: %s" % line)
                return

        # Skip anything that isn't properly assigned
        if status not in ("assigned", "allocated"):
            return

        # Cast prefix into an integer
        try:
            prefix = int(prefix)
        except ValueError:
            log.warning("Invalid prefix: %s" % prefix)
            return

        # Fix prefix length for IPv4
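        # (In the extended delegated format, this field holds the number of
        # addresses for IPv4 rather than a prefix length, e.g. 256 => /24.)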
        if type == "ipv4":
            prefix = 32 - int(math.log(prefix, 2))

        # Try to parse the address
        try:
            network = ipaddress.ip_network("%s/%s" % (address, prefix), strict=False)
        except ValueError:
            log.warning("Invalid IP address: %s" % address)
            return

        if not self._check_parsed_network(network):
            return

        self.db.execute("INSERT INTO networks(network, country, original_countries, source) \
            VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO \
            UPDATE SET country = excluded.country",
            "%s" % network, country, [country], source_key,
        )

    def _import_as_names_from_arin(self, downloader):
        # Delete all previously imported content
        self.db.execute("DELETE FROM autnums WHERE source = %s", "ARIN")

        # Try to retrieve the feed from ftp.arin.net
        feed = downloader.request_lines("https://ftp.arin.net/pub/resource_registry_service/asns.csv")

        # Walk through the file
        for line in csv.DictReader(feed, dialect="arin"):
            log.debug("Processing object: %s" % line)

            # Fetch status
            status = line.get("Status")

            # We are only interested in anything managed by ARIN
            if status != "Full Registry Services":
                continue

            # Fetch organization name
            name = line.get("Org Name")

            # Extract ASNs
            first_asn = line.get("Start AS Number")
            last_asn = line.get("End AS Number")

            # Cast to a number
            try:
                first_asn = int(first_asn)
            except (TypeError, ValueError):
                log.warning("Could not parse ASN '%s'" % first_asn)
                continue

            try:
                last_asn = int(last_asn)
            except (TypeError, ValueError):
                log.warning("Could not parse ASN '%s'" % last_asn)
                continue

            # Check if the range is valid
            if last_asn < first_asn:
                log.warning("Invalid ASN range %s-%s" % (first_asn, last_asn))

            # Insert everything into the database
            for asn in range(first_asn, last_asn + 1):
                if not self._check_parsed_asn(asn):
                    log.warning("Skipping invalid ASN %s" % asn)
                    continue

                self.db.execute("""
                    INSERT INTO
                        autnums
                    (
                        number,
                        name,
                        source
                    )
                    VALUES
                    (
                        %s, %s, %s
                    )
                    ON CONFLICT
                    (
                        number
                    )
                    DO NOTHING
                    """, asn, name, "ARIN",
                )

    def handle_update_announcements(self, ns):
        server = ns.server[0]

        with self.db.transaction():
            if server.startswith("/"):
                self._handle_update_announcements_from_bird(server)

            # Purge anything we never want here
            self.db.execute("""
                -- Delete default routes
                DELETE FROM announcements WHERE network = '::/0' OR network = '0.0.0.0/0';

                -- Delete anything that is not global unicast address space
                DELETE FROM announcements WHERE family(network) = 6 AND NOT network <<= '2000::/3';

                -- DELETE "current network" address space
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '0.0.0.0/8';

                -- DELETE local loopback address space
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '127.0.0.0/8';

                -- DELETE RFC 1918 address space
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '10.0.0.0/8';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '172.16.0.0/12';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.168.0.0/16';

                -- DELETE test, benchmark and documentation address space
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.0.0/24';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.2.0/24';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.18.0.0/15';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.51.100.0/24';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '203.0.113.0/24';

                -- DELETE CGNAT address space (RFC 6598)
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '100.64.0.0/10';

                -- DELETE link local address space
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '169.254.0.0/16';

                -- DELETE IPv6 to IPv4 (6to4) address space (RFC 3068)
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.88.99.0/24';
                DELETE FROM announcements WHERE family(network) = 6 AND network <<= '2002::/16';

                -- DELETE multicast and reserved address space
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '224.0.0.0/4';
                DELETE FROM announcements WHERE family(network) = 4 AND network <<= '240.0.0.0/4';

                -- Delete networks that are too small to be in the global routing table
                DELETE FROM announcements WHERE family(network) = 6 AND masklen(network) > 48;
                DELETE FROM announcements WHERE family(network) = 4 AND masklen(network) > 24;

                -- Delete any non-public or reserved ASNs
                DELETE FROM announcements WHERE NOT (
                    (autnum >= 1 AND autnum <= 23455)
                OR
                    (autnum >= 23457 AND autnum <= 64495)
                OR
                    (autnum >= 131072 AND autnum <= 4199999999)
                );

                -- Delete everything that we have not seen for 14 days
                DELETE FROM announcements WHERE last_seen_at <= CURRENT_TIMESTAMP - INTERVAL '14 days';
            """)

    def _handle_update_announcements_from_bird(self, server):
        # Pre-compile the regular expression for faster searching
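        # Lines from BIRD's "show route" output are expected to look roughly
        # like "<network> ... [... AS<number>i]"; the first group captures the
        # network, the second the originating ASN (if present).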
        route = re.compile(rb"^\s(.+?)\s+.+?\[(?:AS(.*?))?.\]$")

        log.info("Requesting routing table from Bird (%s)" % server)

        aggregated_networks = []

        # Send command to list all routes
        for line in self._bird_cmd(server, "show route"):
            m = route.match(line)
            if not m:
                # Skip empty lines
                if not line:
                    pass

                # Ignore any header lines with the name of the routing table
                elif line.startswith(b"Table"):
                    pass

                # Log anything else
                else:
                    log.debug("Could not parse line: %s" % line.decode())

                continue

            # Fetch the extracted network and ASN
            network, autnum = m.groups()

            # Decode into strings
            if network:
                network = network.decode()
            if autnum:
                autnum = autnum.decode()

            # Collect all aggregated networks
            if not autnum:
                log.debug("%s is an aggregated network" % network)
                aggregated_networks.append(network)
                continue

            # Insert it into the database
            self.db.execute("INSERT INTO announcements(network, autnum) \
                VALUES(%s, %s) ON CONFLICT (network) DO \
                UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
                network, autnum,
            )

        # Process any aggregated networks
        for network in aggregated_networks:
            log.debug("Processing aggregated network %s" % network)

            # Run "show route all" for each network
            for line in self._bird_cmd(server, "show route %s all" % network):
                # Try finding the path
                m = re.match(rb"\s+BGP\.as_path:.* (\d+) {\d+}$", line)
                if m:
                    # Select the last AS number in the path
                    autnum = m.group(1).decode()

                    # Insert it into the database
                    self.db.execute("INSERT INTO announcements(network, autnum) \
                        VALUES(%s, %s) ON CONFLICT (network) DO \
                        UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
                        network, autnum,
                    )

                    # We don't need to process any more
                    break

    def _bird_cmd(self, socket_path, command):
        # Connect to the socket
        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        s.connect(socket_path)

        # Allocate some buffer
        buffer = b""

        log.debug("Sending Bird command: %s" % command)

        # Send the command
        s.send(b"%s\n" % command.encode())

        while True:
            # Fill up the buffer
            buffer += s.recv(4096)

            while True:
                # Search for the next newline
                pos = buffer.find(b"\n")

                # If we cannot find one, we go back and read more data
                if pos <= 0:
                    break

                # Cut after the newline character
                pos += 1

                # Split the line we want and keep the rest in buffer
                line, buffer = buffer[:pos], buffer[pos:]

                # Try parsing any status lines
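                # Status lines begin with a four-digit reply code, followed
                # by either a space (ASCII 32) on final lines or a minus
                # sign (ASCII 45) on continuation lines.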
                if len(line) > 4 and line[:4].isdigit() and line[4] in (32, 45):
                    code, delim, line = int(line[:4]), line[4], line[5:]

                    log.debug("Received response code %s from bird" % code)

                    # End of output
                    if code == 0:
                        return

                    # Ignore hello line
                    elif code == 1:
                        continue

                # Otherwise return the line
                yield line

    def handle_update_geofeeds(self, ns):
        # Sync geofeeds
        with self.db.transaction():
            # Delete all geofeeds which are no longer linked
            self.db.execute("""
                DELETE FROM
                    geofeeds
                WHERE
                    NOT EXISTS (
                        SELECT
                            1
                        FROM
                            network_geofeeds
                        WHERE
                            geofeeds.url = network_geofeeds.url
                    )""",
            )

            # Copy all geofeeds
            self.db.execute("""
                INSERT INTO
                    geofeeds(
                        url
                    )
                SELECT
                    url
                FROM
                    network_geofeeds
                ON CONFLICT (url)
                    DO NOTHING
                """,
            )

        # Fetch all Geofeeds that require an update
        geofeeds = self.db.query("""
            SELECT
                id,
                url
            FROM
                geofeeds
            WHERE
                updated_at IS NULL
            OR
                updated_at <= CURRENT_TIMESTAMP - INTERVAL '1 week'
            ORDER BY
                id
            """)

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = executor.map(self._fetch_geofeed, geofeeds)

            # Fetch all results to raise any exceptions
            for result in results:
                pass

        # Delete data from any feeds that did not update in the last two weeks
        with self.db.transaction():
            self.db.execute("""
                DELETE FROM
                    geofeed_networks
                WHERE
                    geofeed_networks.geofeed_id IN (
                        SELECT
                            geofeeds.id
                        FROM
                            geofeeds
                        WHERE
                            updated_at IS NULL
                        OR
                            updated_at <= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
                    )
                """)

    def _fetch_geofeed(self, geofeed):
        log.debug("Fetching Geofeed %s" % geofeed.url)

        with self.db.transaction():
            # Open the URL
            try:
                req = urllib.request.Request(geofeed.url, headers={
                    "User-Agent" : "location/%s" % location.__version__,

                    # We expect some plain text file in CSV format
                    "Accept" : "text/csv, text/plain",
                })

                # XXX set proxy

                # Send the request
                with urllib.request.urlopen(req, timeout=10) as f:
                    # Remove any previous data
                    self.db.execute("DELETE FROM geofeed_networks \
                        WHERE geofeed_id = %s", geofeed.id)

                    lineno = 0

                    # Read the output line by line
                    for line in f:
                        lineno += 1

                        try:
                            line = line.decode()

                        # Ignore any lines we cannot decode
                        except UnicodeDecodeError:
                            log.debug("Could not decode line %s in %s" \
                                % (lineno, geofeed.url))
                            continue

                        # Strip any newline
                        line = line.rstrip()

                        # Skip empty lines
                        if not line:
                            continue

                        # Try to parse the line
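                        # (Geofeed entries are CSV in RFC 8805 format:
                        # prefix,country,region,city,postal code; only the
                        # first four fields are used here.)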
                        try:
                            fields = line.split(",", 5)
                        except ValueError:
                            log.debug("Could not parse line: %s" % line)
                            continue

                        # Check if we have enough fields
                        if len(fields) < 4:
                            log.debug("Not enough fields in line: %s" % line)
                            continue

                        # Fetch all fields
                        network, country, region, city = fields[:4]

                        # Try to parse the network
                        try:
                            network = ipaddress.ip_network(network, strict=False)
                        except ValueError:
                            log.debug("Could not parse network: %s" % network)
                            continue

                        # Strip any excess whitespace from country codes
                        country = country.strip()

                        # Make the country code uppercase
                        country = country.upper()

                        # Check the country code
                        if not country:
                            log.debug("Empty country code in Geofeed %s line %s" \
                                % (geofeed.url, lineno))
                            continue

                        elif not location.country_code_is_valid(country):
                            log.debug("Invalid country code in Geofeed %s:%s: %s" \
                                % (geofeed.url, lineno, country))
                            continue

                        # Write this into the database
                        self.db.execute("""
                            INSERT INTO
                                geofeed_networks (
                                    geofeed_id,
                                    network,
                                    country,
                                    region,
                                    city
                                )
                            VALUES (%s, %s, %s, %s, %s)""",
                            geofeed.id,
                            "%s" % network,
                            country,
                            region,
                            city,
                        )

            # Catch any HTTP errors
            except urllib.error.HTTPError as e:
                self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
                    WHERE id = %s", e.code, "%s" % e, geofeed.id)

                # Remove any previous data when the feed has been deleted
                if e.code == 404:
                    self.db.execute("DELETE FROM geofeed_networks \
                        WHERE geofeed_id = %s", geofeed.id)

            # Catch any other errors and connection timeouts
            except (http.client.InvalidURL, urllib.error.URLError, TimeoutError) as e:
                log.debug("Could not fetch URL %s: %s" % (geofeed.url, e))

                self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
                    WHERE id = %s", 599, "%s" % e, geofeed.id)

            # Mark the geofeed as updated
            else:
                self.db.execute("""
                    UPDATE
                        geofeeds
                    SET
                        updated_at = CURRENT_TIMESTAMP,
                        status = NULL,
                        error = NULL
                    WHERE
                        id = %s""",
                    geofeed.id,
                )

    def handle_update_overrides(self, ns):
        with self.db.transaction():
            # Only drop manually created overrides, as we can be reasonably
            # sure to have them, and preserve the rest; overrides from other
            # sources are deleted by their corresponding functions where
            # appropriate.
            self.db.execute("""
                DELETE FROM autnum_overrides WHERE source = 'manual';
                DELETE FROM network_overrides WHERE source = 'manual';
            """)

            for file in ns.files:
                log.info("Reading %s..." % file)

                with open(file, "rb") as f:
                    for type, block in location.importer.read_blocks(f):
                        if type == "net":
                            network = block.get("net")
                            # Try to parse and normalise the network
                            try:
                                network = ipaddress.ip_network(network, strict=False)
                            except ValueError as e:
                                log.warning("Invalid IP network: %s: %s" % (network, e))
                                continue

                            # Prevent that we overwrite all networks
                            if network.prefixlen == 0:
                                log.warning("Skipping %s: You cannot overwrite default" % network)
                                continue

                            self.db.execute("""
                                INSERT INTO network_overrides(
                                    network,
                                    country,
                                    source,
                                    is_anonymous_proxy,
                                    is_satellite_provider,
                                    is_anycast,
                                    is_drop
                                ) VALUES (%s, %s, %s, %s, %s, %s, %s)
                                ON CONFLICT (network) DO NOTHING""",
                                "%s" % network,
                                block.get("country"),
                                "manual",
                                self._parse_bool(block, "is-anonymous-proxy"),
                                self._parse_bool(block, "is-satellite-provider"),
                                self._parse_bool(block, "is-anycast"),
                                self._parse_bool(block, "drop"),
                            )

                        elif type == "aut-num":
                            autnum = block.get("aut-num")

                            # Check if AS number begins with "AS"
                            if not autnum.startswith("AS"):
                                log.warning("Invalid AS number: %s" % autnum)
                                continue

                            # Strip "AS"
                            autnum = autnum[2:]

                            self.db.execute("""
                                INSERT INTO autnum_overrides(
                                    number,
                                    name,
                                    country,
                                    source,
                                    is_anonymous_proxy,
                                    is_satellite_provider,
                                    is_anycast,
                                    is_drop
                                ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)
                                ON CONFLICT DO NOTHING""",
                                autnum,
                                block.get("name"),
                                block.get("country"),
                                "manual",
                                self._parse_bool(block, "is-anonymous-proxy"),
                                self._parse_bool(block, "is-satellite-provider"),
                                self._parse_bool(block, "is-anycast"),
                                self._parse_bool(block, "drop"),
                            )

                        else:
                            log.warning("Unsupported type: %s" % type)

    def handle_update_feeds(self, ns):
        """
            Update any third-party feeds
        """
        success = True

        # Create a downloader
        downloader = location.importer.Downloader()

        feeds = (
            # AWS IP Ranges
            ("AWS-IP-RANGES", self._import_aws_ip_ranges, "https://ip-ranges.amazonaws.com/ip-ranges.json"),
        )

        # Walk through all feeds
        for name, callback, url, *args in feeds:
            try:
                self._process_feed(downloader, name, callback, url, *args)

            # Log an error but continue if an exception occurs
            except Exception as e:
                log.error("Error processing feed '%s': %s" % (name, e))
                success = False

        # Spamhaus
        self._update_feed_for_spamhaus_drop()

        # Return status
        return 0 if success else 1

    def _process_feed(self, downloader, name, callback, url, *args):
        """
            Processes one feed
        """
        # Open the URL
        f = downloader.retrieve(url)

        with self.db.transaction():
            # Drop any previous content
            self.db.execute("DELETE FROM autnum_feeds WHERE source = %s", name)
            self.db.execute("DELETE FROM network_feeds WHERE source = %s", name)

            # Call the callback to process the feed
            return callback(name, f, *args)

    def _import_aws_ip_ranges(self, name, f):
        # Parse the feed
        aws_ip_dump = json.load(f)

        # XXX: Set up a dictionary for mapping a region name to a country. Unfortunately,
        # there seems to be no machine-readable version available of this other than
        # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html
        # (worse, it seems to be incomplete :-/ ); https://www.cloudping.cloud/endpoints
        # was helpful here as well.
        aws_region_country_map = {
            "af-south-1": "ZA",
            "ap-east-1": "HK",
            "ap-south-1": "IN",
            "ap-south-2": "IN",
            "ap-northeast-3": "JP",
            "ap-northeast-2": "KR",
            "ap-southeast-1": "SG",
            "ap-southeast-2": "AU",
            "ap-southeast-3": "MY",
            "ap-southeast-4": "AU",
            "ap-southeast-5": "NZ", # Auckland, NZ
            "ap-southeast-6": "AP", # XXX: Precise location not documented anywhere
            "ap-northeast-1": "JP",
            "ca-central-1": "CA",
            "ca-west-1": "CA",
            "eu-central-1": "DE",
            "eu-central-2": "CH",
            "eu-west-1": "IE",
            "eu-west-2": "GB",
            "eu-south-1": "IT",
            "eu-south-2": "ES",
            "eu-west-3": "FR",
            "eu-north-1": "SE",
            "eusc-de-east-1" : "DE", # XXX: Undocumented, likely located in Berlin rather than Frankfurt
            "il-central-1": "IL", # XXX: This one is not documented anywhere except for ip-ranges.json itself
            "me-central-1": "AE",
            "me-south-1": "BH",
            "sa-east-1": "BR"
        }

        # Fetch all valid country codes to check parsed networks against...
        rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
        validcountries = []

        for row in rows:
            validcountries.append(row.country_code)

        for snetwork in aws_ip_dump["prefixes"] + aws_ip_dump["ipv6_prefixes"]:
            try:
                network = ipaddress.ip_network(snetwork.get("ip_prefix") or snetwork.get("ipv6_prefix"), strict=False)
            except ValueError:
                log.warning("Unable to parse line: %s" % snetwork)
                continue

            # Sanitize parsed networks...
            if not self._check_parsed_network(network):
                continue

            # Determine region of this network...
            region = snetwork["region"]
            cc = None
            is_anycast = False

            # Any region name starting with "us-" will get "US" country code assigned straight away...
            if region.startswith("us-"):
                cc = "US"
            elif region.startswith("cn-"):
                # ... same goes for China ...
                cc = "CN"
            elif region == "GLOBAL":
                # ... funny region name for anycast-like networks ...
                is_anycast = True
            elif region in aws_region_country_map:
                # ... assign looked up country code otherwise ...
                cc = aws_region_country_map[region]
            else:
                # ... and bail out if we are missing something here
                log.warning("Unable to determine country code for line: %s" % snetwork)
                continue

            # Skip networks with unknown country codes
            if not is_anycast and validcountries and cc not in validcountries:
                log.warning("Skipping Amazon AWS network with bogus country '%s': %s" % \
                    (cc, network))
                continue

            # Conduct SQL statement...
            self.db.execute("""
                INSERT INTO
                    network_feeds
                (
                    network,
                    source,
                    country,
                    is_anycast
                )
                VALUES
                (
                    %s, %s, %s, %s
                )
                ON CONFLICT (network, source) DO NOTHING
                """, "%s" % network, name, cc, is_anycast,
            )

    def _update_feed_for_spamhaus_drop(self):
        downloader = location.importer.Downloader()

        ip_lists = [
            ("SPAMHAUS-DROP", "https://www.spamhaus.org/drop/drop.txt"),
            ("SPAMHAUS-EDROP", "https://www.spamhaus.org/drop/edrop.txt"),
            ("SPAMHAUS-DROPV6", "https://www.spamhaus.org/drop/dropv6.txt")
        ]

        asn_lists = [
            ("SPAMHAUS-ASNDROP", "https://www.spamhaus.org/drop/asndrop.json")
        ]

        for name, url in ip_lists:
            # Fetch IP list from given URL
            f = downloader.retrieve(url)

            # Split into lines
            fcontent = f.readlines()

            with self.db.transaction():
                # Conduct a very basic sanity check to rule out CDN issues causing bogus DROP
                # downloads.
                if len(fcontent) > 10:
                    self.db.execute("DELETE FROM network_feeds WHERE source = %s", name)
                else:
                    log.warning("%s (%s) returned likely bogus file, ignored" % (name, url))
                    continue

                # Iterate through every line, filter comments and add remaining networks to
                # the feed table in case they are valid...
                for sline in fcontent:
                    # The response is assumed to be encoded in UTF-8...
                    sline = sline.decode("utf-8")

                    # Comments start with a semicolon...
                    if sline.startswith(";"):
                        continue

                    # Extract network and ignore anything afterwards...
                    try:
                        network = ipaddress.ip_network(sline.split()[0], strict=False)
                    except ValueError:
                        log.error("Unable to parse line: %s" % sline)
                        continue

                    # Sanitize parsed networks...
                    if not self._check_parsed_network(network):
                        log.warning("Skipping bogus network found in %s (%s): %s" % \
                            (name, url, network))
                        continue

                    # Conduct SQL statement...
                    self.db.execute("""
                        INSERT INTO
                            network_feeds
                        (
                            network,
                            source,
                            is_drop
                        )
                        VALUES
                        (
                            %s, %s, %s
                        )""", "%s" % network, name, True,
                    )

        for name, url in asn_lists:
            # Fetch URL
            f = downloader.retrieve(url)

            # Split into lines
            fcontent = f.readlines()

            with self.db.transaction():
                # Conduct a very basic sanity check to rule out CDN issues causing bogus DROP
                # downloads.
                if len(fcontent) > 10:
                    self.db.execute("DELETE FROM autnum_feeds WHERE source = %s", name)
                else:
                    log.warning("%s (%s) returned likely bogus file, ignored" % (name, url))
                    continue

                # Iterate through every line, filter comments and add remaining ASNs to
                # the feed table in case they are valid...
                for sline in fcontent:
                    # The response is assumed to be encoded in UTF-8...
                    sline = sline.decode("utf-8")

                    # Load every line as a JSON object and try to obtain an ASN from it...
                    try:
                        lineobj = json.loads(sline)
                    except json.decoder.JSONDecodeError:
                        log.error("Unable to parse line as a JSON object: %s" % sline)
                        continue

                    # Skip line containing file metadata
                    try:
                        type = lineobj["type"]

                        if type == "metadata":
                            continue
                    except KeyError:
                        pass

                    try:
                        asn = lineobj["asn"]
                        as_name = lineobj["asname"]
                    except KeyError:
                        log.warning("Unable to extract necessary information from line: %s" % sline)
                        continue

                    # Filter invalid ASNs...
                    if not self._check_parsed_asn(asn):
                        log.warning("Skipping bogus ASN found in %s (%s): %s" % \
                            (name, url, asn))
                        continue

                    # Conduct SQL statement...
                    self.db.execute("""
                        INSERT INTO
                            autnum_feeds
                        (
                            number,
                            source,
                            is_drop
                        )
                        VALUES
                        (
                            %s, %s, %s
                        )""", "%s" % asn, name, True,
                    )

    @staticmethod
    def _parse_bool(block, key):
        val = block.get(key)

        # There is no point to proceed when we got None
        if val is None:
            return

        # Convert to lowercase
        val = val.lower()

        # True
        if val in ("yes", "1"):
            return True

        # False
        if val in ("no", "0"):
            return False

        # Default to None
        return None

    @property
    def countries(self):
        # Fetch all valid country codes to check parsed networks against
        rows = self.db.query("SELECT * FROM countries ORDER BY country_code")

        # Return all countries
        return [row.country_code for row in rows]

    def handle_import_countries(self, ns):
        with self.db.transaction():
            # Drop all data that we have
            self.db.execute("TRUNCATE TABLE countries")

            for file in ns.file:
                for line in file:
                    line = line.rstrip()

                    # Ignore any comments
                    if line.startswith("#"):
                        continue

                    try:
                        country_code, continent_code, name = line.split(maxsplit=2)
                    except ValueError:
                        log.warning("Could not parse line: %s" % line)
                        continue

                    self.db.execute("INSERT INTO countries(country_code, name, continent_code) \
                        VALUES(%s, %s, %s) ON CONFLICT DO NOTHING", country_code, name, continent_code)


def split_line(line):
    key, colon, val = line.partition(":")

    # Strip any excess space
    key = key.strip()
    val = val.strip()

    return key, val

def main():
    # Run the command line interface
    c = CLI()
    c.run()

main()