1 #!/usr/bin/python3
2 ###############################################################################
3 # #
4 # libloc - A library to determine the location of someone on the Internet #
5 # #
6 # Copyright (C) 2020-2024 IPFire Development Team <info@ipfire.org> #
7 # #
8 # This library is free software; you can redistribute it and/or #
9 # modify it under the terms of the GNU Lesser General Public #
10 # License as published by the Free Software Foundation; either #
11 # version 2.1 of the License, or (at your option) any later version. #
12 # #
13 # This library is distributed in the hope that it will be useful, #
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
16 # Lesser General Public License for more details. #
17 # #
18 ###############################################################################
19
20 import argparse
21 import concurrent.futures
22 import csv
23 import http.client
24 import ipaddress
25 import json
26 import logging
27 import math
28 import re
29 import socket
30 import sys
31 import urllib.error, urllib.parse, urllib.request
32
33 # Load our location module
34 import location
35 import location.database
36 import location.importer
37 from location.i18n import _
38
39 # Initialise logging
40 log = logging.getLogger("location.importer")
41 log.propagate = 1
42
43 # Define constants
44 VALID_ASN_RANGES = (
45 (1, 23455),
46 (23457, 64495),
47 (131072, 4199999999),
48 )
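# These ranges cover the publicly assignable 16-bit and 32-bit ASN space.
# AS23456 is excluded since it is AS_TRANS (RFC 6793); 64496-131071 and
# everything above 4199999999 are documentation, private use or otherwise
# reserved ranges (RFC 5398, RFC 6996, RFC 7300).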
49
50 # Configure the CSV parser for ARIN
51 csv.register_dialect("arin", delimiter=",", quoting=csv.QUOTE_ALL, quotechar="\"")
52
53 class CLI(object):
54 def parse_cli(self):
55 parser = argparse.ArgumentParser(
56 description=_("Location Importer Command Line Interface"),
57 )
58 subparsers = parser.add_subparsers()
59
60 # Global configuration flags
61 parser.add_argument("--debug", action="store_true",
62 help=_("Enable debug output"))
63 parser.add_argument("--quiet", action="store_true",
64 help=_("Enable quiet mode"))
65
66 # version
67 parser.add_argument("--version", action="version",
68 version="%(prog)s @VERSION@")
69
70 # Database
71 parser.add_argument("--database-host", required=True,
72 help=_("Database Hostname"), metavar=_("HOST"))
73 parser.add_argument("--database-name", required=True,
74 help=_("Database Name"), metavar=_("NAME"))
75 parser.add_argument("--database-username", required=True,
76 help=_("Database Username"), metavar=_("USERNAME"))
77 parser.add_argument("--database-password", required=True,
78 help=_("Database Password"), metavar=_("PASSWORD"))
79
80 # Write Database
81 write = subparsers.add_parser("write", help=_("Write database to file"))
82 write.set_defaults(func=self.handle_write)
83 write.add_argument("file", nargs=1, help=_("Database File"))
84 write.add_argument("--signing-key", nargs="?", type=open, help=_("Signing Key"))
85 write.add_argument("--backup-signing-key", nargs="?", type=open, help=_("Backup Signing Key"))
86 write.add_argument("--vendor", nargs="?", help=_("Sets the vendor"))
87 write.add_argument("--description", nargs="?", help=_("Sets a description"))
88 write.add_argument("--license", nargs="?", help=_("Sets the license"))
89 write.add_argument("--version", type=int, help=_("Database Format Version"))
90
91 # Update WHOIS
92 update_whois = subparsers.add_parser("update-whois", help=_("Update WHOIS Information"))
93 update_whois.set_defaults(func=self.handle_update_whois)
94
95 # Update announcements
96 update_announcements = subparsers.add_parser("update-announcements",
97 help=_("Update BGP Announcements"))
98 update_announcements.set_defaults(func=self.handle_update_announcements)
99 update_announcements.add_argument("server", nargs=1,
100 help=_("Route Server to connect to"), metavar=_("SERVER"))
101
102 # Update geofeeds
103 update_geofeeds = subparsers.add_parser("update-geofeeds",
104 help=_("Update Geofeeds"))
105 update_geofeeds.set_defaults(func=self.handle_update_geofeeds)
106
107 # Update overrides
108 update_overrides = subparsers.add_parser("update-overrides",
109 help=_("Update overrides"),
110 )
111 update_overrides.add_argument(
112 "files", nargs="+", help=_("Files to import"),
113 )
114 update_overrides.set_defaults(func=self.handle_update_overrides)
115
116 # Import countries
117 import_countries = subparsers.add_parser("import-countries",
118 help=_("Import countries"),
119 )
120 import_countries.add_argument("file", nargs=1, type=argparse.FileType("r"),
121 help=_("File to import"))
122 import_countries.set_defaults(func=self.handle_import_countries)
123
124 args = parser.parse_args()
125
126 # Configure logging
127 if args.debug:
128 location.logger.set_level(logging.DEBUG)
129 elif args.quiet:
130 location.logger.set_level(logging.WARNING)
131
132 # Print usage if no action was given
133 if not "func" in args:
134 parser.print_usage()
135 sys.exit(2)
136
137 return args
138
139 def run(self):
140 # Parse command line arguments
141 args = self.parse_cli()
142
143 # Initialise database
144 self.db = self._setup_database(args)
145
146 # Call function
147 ret = args.func(args)
148
149 # Return with exit code
150 if ret:
151 sys.exit(ret)
152
153 # Otherwise just exit
154 sys.exit(0)
155
156 def _setup_database(self, ns):
157 """
158 Initialise the database
159 """
160 # Connect to database
161 db = location.database.Connection(
162 host=ns.database_host, database=ns.database_name,
163 user=ns.database_username, password=ns.database_password,
164 )
165
166 with db.transaction():
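# The statements below are idempotent (CREATE ... IF NOT EXISTS and
# ALTER TABLE ... ADD COLUMN IF NOT EXISTS), so they double as a small
# in-place schema migration and can safely run against an existing database.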
167 db.execute("""
168 -- announcements
169 CREATE TABLE IF NOT EXISTS announcements(network inet, autnum bigint,
170 first_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
171 last_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP);
172 CREATE UNIQUE INDEX IF NOT EXISTS announcements_networks ON announcements(network);
173 CREATE INDEX IF NOT EXISTS announcements_family ON announcements(family(network));
174 CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);
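-- Note: the GIST indexes created with the inet_ops operator class are
-- what make the containment lookups (<<=) used throughout this script
-- fast; a plain BTREE index cannot serve these queries.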
175
176 -- autnums
177 CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
178 ALTER TABLE autnums ADD COLUMN IF NOT EXISTS source text;
179 CREATE UNIQUE INDEX IF NOT EXISTS autnums_number ON autnums(number);
180
181 -- countries
182 CREATE TABLE IF NOT EXISTS countries(
183 country_code text NOT NULL, name text NOT NULL, continent_code text NOT NULL);
184 CREATE UNIQUE INDEX IF NOT EXISTS countries_country_code ON countries(country_code);
185
186 -- networks
187 CREATE TABLE IF NOT EXISTS networks(network inet, country text);
188 ALTER TABLE networks ADD COLUMN IF NOT EXISTS original_countries text[];
189 ALTER TABLE networks ADD COLUMN IF NOT EXISTS source text;
190 CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
191 CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
192 CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);
193
194 -- geofeeds
195 CREATE TABLE IF NOT EXISTS geofeeds(
196 id serial primary key,
197 url text,
198 status integer default null,
199 updated_at timestamp without time zone default null
200 );
201 ALTER TABLE geofeeds ADD COLUMN IF NOT EXISTS error text;
202 CREATE UNIQUE INDEX IF NOT EXISTS geofeeds_unique
203 ON geofeeds(url);
204 CREATE TABLE IF NOT EXISTS geofeed_networks(
205 geofeed_id integer references geofeeds(id) on delete cascade,
206 network inet,
207 country text,
208 region text,
209 city text
210 );
211 CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
212 ON geofeed_networks(geofeed_id);
213 CREATE INDEX IF NOT EXISTS geofeed_networks_search
214 ON geofeed_networks USING GIST(network inet_ops);
215 CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
216 CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
217 ON network_geofeeds(network);
218 CREATE INDEX IF NOT EXISTS network_geofeeds_search
219 ON network_geofeeds USING GIST(network inet_ops);
220 CREATE INDEX IF NOT EXISTS network_geofeeds_url
221 ON network_geofeeds(url);
222
223 -- feeds
224 CREATE TABLE IF NOT EXISTS autnum_feeds(
225 number bigint NOT NULL,
226 source text NOT NULL,
227 name text,
228 country text,
229 is_anonymous_proxy boolean,
230 is_satellite_provider boolean,
231 is_anycast boolean,
232 is_drop boolean
233 );
234 CREATE UNIQUE INDEX IF NOT EXISTS autnum_feeds_unique
235 ON autnum_feeds(number, source);
236
237 CREATE TABLE IF NOT EXISTS network_feeds(
238 network inet NOT NULL,
239 source text NOT NULL,
240 country text,
241 is_anonymous_proxy boolean,
242 is_satellite_provider boolean,
243 is_anycast boolean,
244 is_drop boolean
245 );
246 CREATE UNIQUE INDEX IF NOT EXISTS network_feeds_unique
247 ON network_feeds(network, source);
248 CREATE INDEX IF NOT EXISTS network_feeds_search
249 ON network_feeds USING GIST(network inet_ops);
250
251 -- overrides
252 CREATE TABLE IF NOT EXISTS autnum_overrides(
253 number bigint NOT NULL,
254 name text,
255 country text,
256 is_anonymous_proxy boolean,
257 is_satellite_provider boolean,
258 is_anycast boolean
259 );
260 CREATE UNIQUE INDEX IF NOT EXISTS autnum_overrides_number
261 ON autnum_overrides(number);
262 ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS source text;
263 ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
264
265 CREATE TABLE IF NOT EXISTS network_overrides(
266 network inet NOT NULL,
267 country text,
268 is_anonymous_proxy boolean,
269 is_satellite_provider boolean,
270 is_anycast boolean
271 );
272 CREATE UNIQUE INDEX IF NOT EXISTS network_overrides_network
273 ON network_overrides(network);
274 CREATE INDEX IF NOT EXISTS network_overrides_search
275 ON network_overrides USING GIST(network inet_ops);
276 ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS source text;
277 ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
278 """)
279
280 return db
281
282 def handle_write(self, ns):
283 """
284 Compiles a database in libloc format out of what is in the database
285 """
286 # Allocate a writer
287 writer = location.Writer(ns.signing_key, ns.backup_signing_key)
288
289 # Set all metadata
290 if ns.vendor:
291 writer.vendor = ns.vendor
292
293 if ns.description:
294 writer.description = ns.description
295
296 if ns.license:
297 writer.license = ns.license
298
299 # Add all Autonomous Systems
300 log.info("Writing Autonomous Systems...")
301
302 # Select all ASes with a name
303 rows = self.db.query("""
304 SELECT
305 autnums.number AS number,
306 COALESCE(
307 overrides.name,
308 autnums.name
309 ) AS name
310 FROM
311 autnums
312 LEFT JOIN
313 autnum_overrides overrides ON autnums.number = overrides.number
314 ORDER BY
315 autnums.number
316 """)
317
318 for row in rows:
319 # Skip AS without names
320 if not row.name:
321 continue
322
323 a = writer.add_as(row.number)
324 a.name = row.name
325
326 # Add all networks
327 log.info("Writing networks...")
328
329 # Select all known networks
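# The sources are merged with a fixed precedence: for every attribute,
# the COALESCE() chains below take the first non-NULL value, trying
# overrides and feeds before falling back to plain RIR data, while
# DISTINCT ON (network) with the masklen() ordering keeps the most
# specific (longest) matching prefix per network.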
330 rows = self.db.query("""
331 WITH known_networks AS (
332 SELECT network FROM announcements
333 UNION
334 SELECT network FROM networks
335 UNION
336 SELECT network FROM network_feeds
337 UNION
338 SELECT network FROM network_overrides
339 UNION
340 SELECT network FROM geofeed_networks
341 ),
342
343 ordered_networks AS (
344 SELECT
345 known_networks.network AS network,
346 announcements.autnum AS autnum,
347 networks.country AS country,
348
349 -- Must be part of returned values for ORDER BY clause
350 masklen(announcements.network) AS sort_a,
351 masklen(networks.network) AS sort_b
352 FROM
353 known_networks
354 LEFT JOIN
355 announcements ON known_networks.network <<= announcements.network
356 LEFT JOIN
357 networks ON known_networks.network <<= networks.network
358 ORDER BY
359 known_networks.network,
360 sort_a DESC,
361 sort_b DESC
362 )
363
364 -- Return a list of those networks enriched with all
365 -- other information that we store in the database
366 SELECT
367 DISTINCT ON (network)
368 network,
369 autnum,
370
371 -- Country
372 COALESCE(
373 (
374 SELECT country FROM network_overrides overrides
375 WHERE networks.network <<= overrides.network
376 ORDER BY masklen(overrides.network) DESC
377 LIMIT 1
378 ),
379 (
380 SELECT country FROM autnum_overrides overrides
381 WHERE networks.autnum = overrides.number
382 ),
383 (
384 SELECT country FROM network_feeds feeds
385 WHERE networks.network <<= feeds.network
386 ORDER BY masklen(feeds.network) DESC
387 LIMIT 1
388 ),
389 (
390 SELECT country FROM autnum_feeds feeds
391 WHERE networks.autnum = feeds.number
392 ORDER BY source LIMIT 1
393 ),
394 (
395 SELECT
396 geofeed_networks.country AS country
397 FROM
398 network_geofeeds
399
400 -- Join the data from the geofeeds
401 LEFT JOIN
402 geofeeds ON network_geofeeds.url = geofeeds.url
403 LEFT JOIN
404 geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id
405
406 -- Check whether we have a geofeed for this network
407 WHERE
408 networks.network <<= network_geofeeds.network
409 AND
410 networks.network <<= geofeed_networks.network
411
412 -- Filter for the best result
413 ORDER BY
414 masklen(geofeed_networks.network) DESC
415 LIMIT 1
416 ),
417 networks.country
418 ) AS country,
419
420 -- Flags
421 COALESCE(
422 (
423 SELECT is_anonymous_proxy FROM network_overrides overrides
424 WHERE networks.network <<= overrides.network
425 ORDER BY masklen(overrides.network) DESC
426 LIMIT 1
427 ),
428 (
429 SELECT is_anonymous_proxy FROM network_feeds feeds
430 WHERE networks.network <<= feeds.network
431 ORDER BY masklen(feeds.network) DESC
432 LIMIT 1
433 ),
434 (
435 SELECT is_anonymous_proxy FROM autnum_feeds feeds
436 WHERE networks.autnum = feeds.number
437 ORDER BY source LIMIT 1
438 ),
439 (
440 SELECT is_anonymous_proxy FROM autnum_overrides overrides
441 WHERE networks.autnum = overrides.number
442 ),
443 FALSE
444 ) AS is_anonymous_proxy,
445 COALESCE(
446 (
447 SELECT is_satellite_provider FROM network_overrides overrides
448 WHERE networks.network <<= overrides.network
449 ORDER BY masklen(overrides.network) DESC
450 LIMIT 1
451 ),
452 (
453 SELECT is_satellite_provider FROM network_feeds feeds
454 WHERE networks.network <<= feeds.network
455 ORDER BY masklen(feeds.network) DESC
456 LIMIT 1
457 ),
458 (
459 SELECT is_satellite_provider FROM autnum_feeds feeds
460 WHERE networks.autnum = feeds.number
461 ORDER BY source LIMIT 1
462 ),
463 (
464 SELECT is_satellite_provider FROM autnum_overrides overrides
465 WHERE networks.autnum = overrides.number
466 ),
467 FALSE
468 ) AS is_satellite_provider,
469 COALESCE(
470 (
471 SELECT is_anycast FROM network_overrides overrides
472 WHERE networks.network <<= overrides.network
473 ORDER BY masklen(overrides.network) DESC
474 LIMIT 1
475 ),
476 (
477 SELECT is_anycast FROM network_feeds feeds
478 WHERE networks.network <<= feeds.network
479 ORDER BY masklen(feeds.network) DESC
480 LIMIT 1
481 ),
482 (
483 SELECT is_anycast FROM autnum_feeds feeds
484 WHERE networks.autnum = feeds.number
485 ORDER BY source LIMIT 1
486 ),
487 (
488 SELECT is_anycast FROM autnum_overrides overrides
489 WHERE networks.autnum = overrides.number
490 ),
491 FALSE
492 ) AS is_anycast,
493 COALESCE(
494 (
495 SELECT is_drop FROM network_overrides overrides
496 WHERE networks.network <<= overrides.network
497 ORDER BY masklen(overrides.network) DESC
498 LIMIT 1
499 ),
500 (
501 SELECT is_drop FROM network_feeds feeds
502 WHERE networks.network <<= feeds.network
503 ORDER BY masklen(feeds.network) DESC
504 LIMIT 1
505 ),
506 (
507 SELECT is_drop FROM autnum_feeds feeds
508 WHERE networks.autnum = feeds.number
509 ORDER BY source LIMIT 1
510 ),
511 (
512 SELECT is_drop FROM autnum_overrides overrides
513 WHERE networks.autnum = overrides.number
514 ),
515 FALSE
516 ) AS is_drop
517 FROM
518 ordered_networks networks
519 """)
520
521 for row in rows:
522 network = writer.add_network(row.network)
523
524 # Save country
525 if row.country:
526 network.country_code = row.country
527
528 # Save ASN
529 if row.autnum:
530 network.asn = row.autnum
531
532 # Set flags
533 if row.is_anonymous_proxy:
534 network.set_flag(location.NETWORK_FLAG_ANONYMOUS_PROXY)
535
536 if row.is_satellite_provider:
537 network.set_flag(location.NETWORK_FLAG_SATELLITE_PROVIDER)
538
539 if row.is_anycast:
540 network.set_flag(location.NETWORK_FLAG_ANYCAST)
541
542 if row.is_drop:
543 network.set_flag(location.NETWORK_FLAG_DROP)
544
545 # Add all countries
546 log.info("Writing countries...")
547 rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
548
549 for row in rows:
550 c = writer.add_country(row.country_code)
551 c.continent_code = row.continent_code
552 c.name = row.name
553
554 # Write everything to file
555 log.info("Writing database to file...")
556 for file in ns.file:
557 writer.write(file)
558
559 def handle_update_whois(self, ns):
560 downloader = location.importer.Downloader()
561
562 # Did we run successfully?
563 error = False
564
565 # Fetch all valid country codes to check parsed networks against
566 validcountries = self.countries
567
568 # Iterate over all potential sources
569 for source in sorted(location.importer.SOURCES):
570 with self.db.transaction():
571 # Create some temporary tables to store parsed data
572 self.db.execute("""
573 CREATE TEMPORARY TABLE _autnums(number integer NOT NULL,
574 organization text NOT NULL, source text NOT NULL) ON COMMIT DROP;
575 CREATE UNIQUE INDEX _autnums_number ON _autnums(number);
576
577 CREATE TEMPORARY TABLE _organizations(handle text NOT NULL,
578 name text NOT NULL, source text NOT NULL) ON COMMIT DROP;
579 CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);
580
581 CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL,
582 original_countries text[] NOT NULL, source text NOT NULL)
583 ON COMMIT DROP;
584 CREATE INDEX _rirdata_search ON _rirdata
585 USING BTREE(family(network), masklen(network));
586 CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
587 """)
588
589 # Remove all previously imported content
590 self.db.execute("DELETE FROM autnums WHERE source = %s", source)
591 self.db.execute("DELETE FROM networks WHERE source = %s", source)
592
593 try:
594 # Fetch WHOIS sources
595 for url in location.importer.WHOIS_SOURCES.get(source, []):
596 for block in downloader.request_blocks(url):
597 self._parse_block(block, source, validcountries)
598
599 # Fetch extended sources
600 for url in location.importer.EXTENDED_SOURCES.get(source, []):
601 for line in downloader.request_lines(url):
602 self._parse_line(line, source, validcountries)
603 except urllib.error.URLError as e:
604 log.error("Could not retrieve data from %s: %s" % (source, e))
605 error = True
606
607 # Continue with the next source
608 continue
609
610 # Process all parsed networks from every RIR we happen to have access to,
611 # insert the largest network chunks into the networks table immediately...
612 families = self.db.query("""
613 SELECT DISTINCT
614 family(network) AS family
615 FROM
616 _rirdata
617 ORDER BY
618 family(network)
619 """,
620 )
621
622 for family in (row.family for row in families):
623 # Fetch the smallest mask length in our data set
624 smallest = self.db.get("""
625 SELECT
626 MIN(
627 masklen(network)
628 ) AS prefix
629 FROM
630 _rirdata
631 WHERE
632 family(network) = %s
633 """, family,
634 )
635
636 # Copy all networks
637 self.db.execute("""
638 INSERT INTO
639 networks
640 (
641 network,
642 country,
643 original_countries,
644 source
645 )
646 SELECT
647 network,
648 country,
649 original_countries,
650 source
651 FROM
652 _rirdata
653 WHERE
654 masklen(network) = %s
655 AND
656 family(network) = %s
657 ON CONFLICT DO
658 NOTHING""",
659 smallest.prefix,
660 family,
661 )
662
663 # ... determine any other prefixes for this network family, ...
664 prefixes = self.db.query("""
665 SELECT
666 DISTINCT masklen(network) AS prefix
667 FROM
668 _rirdata
669 WHERE
670 family(network) = %s
671 ORDER BY
672 masklen(network) ASC
673 OFFSET 1
674 """, family,
675 )
676
677 # ... and insert networks with this prefix in case they provide additional
678 # information (i. e. subnet of a larger chunk with a different country)
679 for prefix in (row.prefix for row in prefixes):
680 self.db.execute("""
681 WITH candidates AS (
682 SELECT
683 _rirdata.network,
684 _rirdata.country,
685 _rirdata.original_countries,
686 _rirdata.source
687 FROM
688 _rirdata
689 WHERE
690 family(_rirdata.network) = %s
691 AND
692 masklen(_rirdata.network) = %s
693 ),
694 filtered AS (
695 SELECT
696 DISTINCT ON (c.network)
697 c.network,
698 c.country,
699 c.original_countries,
700 c.source,
701 masklen(networks.network),
702 networks.country AS parent_country
703 FROM
704 candidates c
705 LEFT JOIN
706 networks
707 ON
708 c.network << networks.network
709 ORDER BY
710 c.network,
711 masklen(networks.network) DESC NULLS LAST
712 )
713 INSERT INTO
714 networks(network, country, original_countries, source)
715 SELECT
716 network,
717 country,
718 original_countries,
719 source
720 FROM
721 filtered
722 WHERE
723 parent_country IS NULL
724 OR
725 country <> parent_country
726 ON CONFLICT DO NOTHING
727 """, family, prefix,
728 )
729
730 self.db.execute("""
731 INSERT INTO
732 autnums
733 (
734 number,
735 name,
736 source
737 )
738 SELECT
739 _autnums.number,
740 _organizations.name,
741 _organizations.source
742 FROM
743 _autnums
744 JOIN
745 _organizations ON _autnums.organization = _organizations.handle
746 ON CONFLICT
747 (
748 number
749 )
750 DO UPDATE
751 SET name = excluded.name
752 """,
753 )
754
755 # Download and import (technical) AS names from ARIN
756 with self.db.transaction():
757 self._import_as_names_from_arin(downloader)
758
759 # Return a non-zero exit code for errors
760 return 1 if error else 0
761
762 def _check_parsed_network(self, network):
763 """
764 Assistive function to detect and subsequently sort out parsed
765 networks from RIR data (both WHOIS and so-called "extended sources")
766 which...
767
768 (a) are not globally routable (RFC 1918 space, et al.),
769 (b) cover too large a chunk of the IP address space (prefix
770 length is < 7 for IPv4 networks, and < 10 for IPv6),
771 (c) have "0.0.0.0" or "::" as their network address, or
772 (d) are too small to be publicly announced (we have decided not to
773 process them at the moment, as they significantly enlarge our
774 database without providing very helpful additional information).
775
776 This unfortunately is necessary due to brain-dead clutter across
777 various RIR databases, causing mismatches and eventually disruptions.
778
779 Returns False in case a network is not suitable for adding it
780 to our database, and True otherwise.
781 """
782
783 if not network or not isinstance(network, (ipaddress.IPv4Network, ipaddress.IPv6Network)):
784 return False
785
786 if not network.is_global:
787 log.debug("Skipping non-globally routable network: %s" % network)
788 return False
789
790 if network.version == 4:
791 if network.prefixlen < 7:
792 log.debug("Skipping too big IP chunk: %s" % network)
793 return False
794
795 if network.prefixlen > 24:
796 log.debug("Skipping network too small to be publicly announced: %s" % network)
797 return False
798
799 if str(network.network_address) == "0.0.0.0":
800 log.debug("Skipping network based on 0.0.0.0: %s" % network)
801 return False
802
803 elif network.version == 6:
804 if network.prefixlen < 10:
805 log.debug("Skipping too big IP chunk: %s" % network)
806 return False
807
808 if network.prefixlen > 48:
809 log.debug("Skipping network too small to be publicly announced: %s" % network)
810 return False
811
812 if str(network.network_address) == "::":
813 log.debug("Skipping network based on '::': %s" % network)
814 return False
815
816 else:
817 # This should not happen...
818 log.warning("Skipping network of unknown family, this should not happen: %s" % network)
819 return False
820
821 # In case we have made it here, the network is considered to
822 # be suitable for libloc consumption...
823 return True
824
825 def _check_parsed_asn(self, asn):
826 """
827 Assistive function to filter Autonomous System Numbers not being suitable
828 for adding to our database. Returns False in such cases, and True otherwise.
829 """
830
831 for start, end in VALID_ASN_RANGES:
832 if start <= asn <= end:
833 return True
834
835 log.info("Supplied ASN %s out of publicly routable ASN ranges" % asn)
836 return False
837
838 def _parse_block(self, block, source_key, validcountries = None):
839 # Get first line to find out what type of block this is
840 line = block[0]
841
842 # aut-num
843 if line.startswith("aut-num:"):
844 return self._parse_autnum_block(block, source_key)
845
846 # inetnum
847 if line.startswith("inet6num:") or line.startswith("inetnum:"):
848 return self._parse_inetnum_block(block, source_key, validcountries)
849
850 # organisation
851 elif line.startswith("organisation:"):
852 return self._parse_org_block(block, source_key)
853
854 def _parse_autnum_block(self, block, source_key):
855 autnum = {}
856 for line in block:
857 # Split line
858 key, val = split_line(line)
859
860 if key == "aut-num":
861 m = re.match(r"^(AS|as)(\d+)", val)
862 if m:
863 autnum["asn"] = m.group(2)
864
865 elif key == "org":
866 autnum[key] = val.upper()
867
868 elif key == "descr":
869 # Save the first description line as well...
870 if not key in autnum:
871 autnum[key] = val
872
873 # Skip empty objects
874 if not autnum or not "asn" in autnum:
875 return
876
877 # Insert a dummy organisation handle into our temporary organisations
878 # table in case the AS does not have an organisation handle set, but
879 # has a description (a quirk often observed in APNIC area), so we can
880 # later display at least some string for this AS.
881 if not "org" in autnum:
882 if "descr" in autnum:
883 autnum["org"] = "LIBLOC-%s-ORGHANDLE" % autnum.get("asn")
884
885 self.db.execute("INSERT INTO _organizations(handle, name, source) \
886 VALUES(%s, %s, %s) ON CONFLICT (handle) DO NOTHING",
887 autnum.get("org"), autnum.get("descr"), source_key,
888 )
889 else:
890 log.warning("ASN %s neither has an organisation handle nor a description line set, omitting" % \
891 autnum.get("asn"))
892 return
893
894 # Insert into database
895 self.db.execute("INSERT INTO _autnums(number, organization, source) \
896 VALUES(%s, %s, %s) ON CONFLICT (number) DO UPDATE SET \
897 organization = excluded.organization",
898 autnum.get("asn"), autnum.get("org"), source_key,
899 )
900
901 def _parse_inetnum_block(self, block, source_key, validcountries = None):
902 log.debug("Parsing inetnum block:")
903
904 inetnum = {}
905 for line in block:
906 log.debug(line)
907
908 # Split line
909 key, val = split_line(line)
910
911 # Filter any inetnum records which are only referring to IP space
912 # not managed by that specific RIR...
913 if key == "netname":
914 if re.match(r"^(ERX-NETBLOCK|(AFRINIC|ARIN|LACNIC|RIPE)-CIDR-BLOCK|IANA-NETBLOCK-\d{1,3}|NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK|STUB-[\d-]{3,}SLASH\d{1,2})", val.strip()):
915 log.debug("Skipping record indicating historic/orphaned data: %s" % val.strip())
916 return
917
918 if key == "inetnum":
919 start_address, delim, end_address = val.partition("-")
920
921 # Strip any excess space
922 start_address, end_address = start_address.rstrip(), end_address.strip()
923
924 # Handle "inetnum" formatting in LACNIC DB (e.g. "24.152.8/22" instead of "24.152.8.0/22")
925 if start_address and not (delim or end_address):
926 try:
927 start_address = ipaddress.ip_network(start_address, strict=False)
928 except ValueError:
929 start_address = start_address.split("/")
930 ldigits = start_address[0].count(".")
931
932 # How many octets do we need to add?
933 # (LACNIC does not seem to have a /8 or greater assigned, so the following should suffice.)
934 if ldigits == 1:
935 start_address = start_address[0] + ".0.0/" + start_address[1]
936 elif ldigits == 2:
937 start_address = start_address[0] + ".0/" + start_address[1]
938 else:
939 log.warning("Could not recover IPv4 address from line in LACNIC DB format: %s" % line)
940 return
941
942 try:
943 start_address = ipaddress.ip_network(start_address, strict=False)
944 except ValueError:
945 log.warning("Could not parse line in LACNIC DB format: %s" % line)
946 return
947
948 # Enumerate first and last IP address of this network
949 end_address = start_address[-1]
950 start_address = start_address[0]
951
952 else:
953 # Convert to IP address
954 try:
955 start_address = ipaddress.ip_address(start_address)
956 end_address = ipaddress.ip_address(end_address)
957 except ValueError:
958 log.warning("Could not parse line: %s" % line)
959 return
960
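# An inetnum range does not have to align with CIDR boundaries, so
# summarize_address_range() may yield more than one network, e.g.
# 10.0.0.0 - 10.0.0.23 becomes 10.0.0.0/28 plus 10.0.0.16/29.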
961 inetnum["inetnum"] = list(ipaddress.summarize_address_range(start_address, end_address))
962
963 elif key == "inet6num":
964 inetnum[key] = [ipaddress.ip_network(val, strict=False)]
965
966 elif key == "country":
967 val = val.upper()
968
969 # Catch RIR data objects with more than one country code...
970 if not key in inetnum:
971 inetnum[key] = []
972 else:
973 if val in inetnum.get("country"):
974 # ... but keep this list distinct...
975 continue
976
977 # When people set country codes to "UK", they actually mean "GB"
978 if val == "UK":
979 val = "GB"
980
981 inetnum[key].append(val)
982
983 # Parse the geofeed attribute
984 elif key == "geofeed":
985 inetnum["geofeed"] = val
986
987 # Parse geofeed when used as a remark
988 elif key == "remarks":
989 m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
990 if m:
991 inetnum["geofeed"] = m.group(1)
992
993 # Skip empty objects
994 if not inetnum or not "country" in inetnum:
995 return
996
997 # Prepare skipping objects with unknown country codes...
998 invalidcountries = [singlecountry for singlecountry in inetnum.get("country") if singlecountry not in validcountries]
999
1000 # Iterate through all networks enumerated from above, check them for plausibility and insert
1001 # them into the database, if _check_parsed_network() succeeded
1002 for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
1003 if self._check_parsed_network(single_network):
1004 # Skip objects with unknown country codes whenever we know which codes are valid, warning only once per object to avoid log spam...
1005 if validcountries and invalidcountries:
1006 log.warning("Skipping network with bogus countr(y|ies) %s (original countries: %s): %s" % \
1007 (invalidcountries, inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
1008 break
1009
1010 # Everything is fine here, run INSERT statement...
1011 self.db.execute("INSERT INTO _rirdata(network, country, original_countries, source) \
1012 VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
1013 "%s" % single_network, inetnum.get("country")[0], inetnum.get("country"), source_key,
1014 )
1015
1016 # Update any geofeed information
1017 geofeed = inetnum.get("geofeed", None)
1018 if geofeed:
1019 self._parse_geofeed(geofeed, single_network)
1020
1021 # Delete any previous geofeeds
1022 else:
1023 self.db.execute("DELETE FROM network_geofeeds WHERE network = %s",
1024 "%s" % single_network)
1025
1026 def _parse_geofeed(self, url, single_network):
1027 # Parse the URL
1028 url = urllib.parse.urlparse(url)
1029
1030 # Make sure that this is a HTTPS URL
1031 if not url.scheme == "https":
1032 log.debug("Geofeed URL is not using HTTPS: %s" % url.geturl())
1033 return
1034
1035 # Put the URL back together normalized
1036 url = url.geturl()
1037
1038 # Store/update any geofeeds
1039 self.db.execute("""
1040 INSERT INTO
1041 network_geofeeds(
1042 network,
1043 url
1044 )
1045 VALUES(
1046 %s, %s
1047 )
1048 ON CONFLICT (network) DO
1049 UPDATE SET url = excluded.url""",
1050 "%s" % single_network, url,
1051 )
1052
1053 def _parse_org_block(self, block, source_key):
1054 org = {}
1055 for line in block:
1056 # Split line
1057 key, val = split_line(line)
1058
1059 if key == "organisation":
1060 org[key] = val.upper()
1061 elif key == "org-name":
1062 org[key] = val
1063
1064 # Skip empty objects
1065 if not org:
1066 return
1067
1068 self.db.execute("INSERT INTO _organizations(handle, name, source) \
1069 VALUES(%s, %s, %s) ON CONFLICT (handle) DO \
1070 UPDATE SET name = excluded.name",
1071 org.get("organisation"), org.get("org-name"), source_key,
1072 )
1073
1074 def _parse_line(self, line, source_key, validcountries = None):
1075 # Skip version line
1076 if line.startswith("2"):
1077 return
1078
1079 # Skip comments
1080 if line.startswith("#"):
1081 return
1082
1083 try:
1084 registry, country_code, type, line = line.split("|", 3)
1085 except ValueError:
1086 log.warning("Could not parse line: %s" % line)
1087 return
1088
1089 # Skip any lines that are for stats only or do not have a country
1090 # code at all (avoids log spam below)
1091 if not country_code or country_code == '*':
1092 return
1093
1094 # Skip objects with unknown country codes
1095 if validcountries and country_code not in validcountries:
1096 log.warning("Skipping line with bogus country '%s': %s" % \
1097 (country_code, line))
1098 return
1099
1100 if type in ("ipv6", "ipv4"):
1101 return self._parse_ip_line(country_code, type, line, source_key)
1102
1103 def _parse_ip_line(self, country, type, line, source_key):
1104 try:
1105 address, prefix, date, status, organization = line.split("|")
1106 except ValueError:
1107 organization = None
1108
1109 # Try parsing the line without organization
1110 try:
1111 address, prefix, date, status = line.split("|")
1112 except ValueError:
1113 log.warning("Unhandled line format: %s" % line)
1114 return
1115
1116 # Skip anything that isn't properly assigned
1117 if not status in ("assigned", "allocated"):
1118 return
1119
1120 # Cast prefix into an integer
1121 try:
1122 prefix = int(prefix)
1123 except ValueError:
1124 log.warning("Invalid prefix: %s" % prefix)
1125 return
1126
1127 # Fix prefix length for IPv4
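# (In the extended delegation format, this field holds the number of
# addresses in the block for IPv4 rather than a prefix length; the
# count is a power of two, e.g. 1024 addresses correspond to a /22.)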
1128 if type == "ipv4":
1129 prefix = 32 - int(math.log(prefix, 2))
1130
1131 # Try to parse the address
1132 try:
1133 network = ipaddress.ip_network("%s/%s" % (address, prefix), strict=False)
1134 except ValueError:
1135 log.warning("Invalid IP address: %s" % address)
1136 return
1137
1138 if not self._check_parsed_network(network):
1139 return
1140
1141 self.db.execute("INSERT INTO networks(network, country, original_countries, source) \
1142 VALUES(%s, %s, %s, %s) ON CONFLICT (network) DO \
1143 UPDATE SET country = excluded.country",
1144 "%s" % network, country, [country], source_key,
1145 )
1146
1147 def _import_as_names_from_arin(self, downloader):
1148 # Delete all previously imported content
1149 self.db.execute("DELETE FROM autnums WHERE source = %s", "ARIN")
1150
1151 # Try to retrieve the feed from ftp.arin.net
1152 feed = downloader.request_lines("https://ftp.arin.net/pub/resource_registry_service/asns.csv")
1153
1154 # Walk through the file
1155 for line in csv.DictReader(feed, dialect="arin"):
1156 log.debug("Processing object: %s" % line)
1157
1158 # Fetch status
1159 status = line.get("Status")
1160
1161 # We are only interested in anything managed by ARIN
1162 if not status == "Full Registry Services":
1163 continue
1164
1165 # Fetch organization name
1166 name = line.get("Org Name")
1167
1168 # Extract ASNs
1169 first_asn = line.get("Start AS Number")
1170 last_asn = line.get("End AS Number")
1171
1172 # Cast to a number
1173 try:
1174 first_asn = int(first_asn)
1175 except (TypeError, ValueError):
1176 log.warning("Could not parse ASN '%s'" % first_asn)
1177 continue
1178
1179 try:
1180 last_asn = int(last_asn)
1181 except (TypeError, ValueError):
1182 log.warning("Could not parse ASN '%s'" % last_asn)
1183 continue
1184
1185 # Check if the range is valid
1186 if last_asn < first_asn:
1187 log.warning("Invalid ASN range %s-%s" % (first_asn, last_asn))
1188
1189 # Insert everything into the database
1190 for asn in range(first_asn, last_asn + 1):
1191 if not self._check_parsed_asn(asn):
1192 log.warning("Skipping invalid ASN %s" % asn)
1193 continue
1194
1195 self.db.execute("""
1196 INSERT INTO
1197 autnums
1198 (
1199 number,
1200 name,
1201 source
1202 )
1203 VALUES
1204 (
1205 %s, %s, %s
1206 )
1207 ON CONFLICT
1208 (
1209 number
1210 )
1211 DO NOTHING
1212 """, asn, name, "ARIN",
1213 )
1214
1215 def handle_update_announcements(self, ns):
1216 server = ns.server[0]
1217
1218 with self.db.transaction():
1219 if server.startswith("/"):
1220 self._handle_update_announcements_from_bird(server)
1221
1222 # Purge anything we never want here
1223 self.db.execute("""
1224 -- Delete default routes
1225 DELETE FROM announcements WHERE network = '::/0' OR network = '0.0.0.0/0';
1226
1227 -- Delete anything that is not global unicast address space
1228 DELETE FROM announcements WHERE family(network) = 6 AND NOT network <<= '2000::/3';
1229
1230 -- DELETE "current network" address space
1231 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '0.0.0.0/8';
1232
1233 -- DELETE local loopback address space
1234 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '127.0.0.0/8';
1235
1236 -- DELETE RFC 1918 address space
1237 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '10.0.0.0/8';
1238 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '172.16.0.0/12';
1239 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.168.0.0/16';
1240
1241 -- DELETE test, benchmark and documentation address space
1242 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.0.0/24';
1243 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.2.0/24';
1244 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.18.0.0/15';
1245 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.51.100.0/24';
1246 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '203.0.113.0/24';
1247
1248 -- DELETE CGNAT address space (RFC 6598)
1249 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '100.64.0.0/10';
1250
1251 -- DELETE link local address space
1252 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '169.254.0.0/16';
1253
1254 -- DELETE IPv6 to IPv4 (6to4) address space (RFC 3068)
1255 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.88.99.0/24';
1256 DELETE FROM announcements WHERE family(network) = 6 AND network <<= '2002::/16';
1257
1258 -- DELETE multicast and reserved address space
1259 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '224.0.0.0/4';
1260 DELETE FROM announcements WHERE family(network) = 4 AND network <<= '240.0.0.0/4';
1261
1262 -- Delete networks that are too small to be in the global routing table
1263 DELETE FROM announcements WHERE family(network) = 6 AND masklen(network) > 48;
1264 DELETE FROM announcements WHERE family(network) = 4 AND masklen(network) > 24;
1265
1266 -- Delete any non-public or reserved ASNs
1267 DELETE FROM announcements WHERE NOT (
1268 (autnum >= 1 AND autnum <= 23455)
1269 OR
1270 (autnum >= 23457 AND autnum <= 64495)
1271 OR
1272 (autnum >= 131072 AND autnum <= 4199999999)
1273 );
1274
1275 -- Delete everything that we have not seen for 14 days
1276 DELETE FROM announcements WHERE last_seen_at <= CURRENT_TIMESTAMP - INTERVAL '14 days';
1277 """)
1278
1279 def _handle_update_announcements_from_bird(self, server):
1280 # Pre-compile the regular expression for faster searching
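# A route line is expected to look roughly like this (AS64496 being a
# made-up documentation ASN):
#   10.0.0.0/24  unicast [peer1 2023-01-01] * (100) [AS64496i]
# The first group captures the prefix, the second the originating ASN;
# the character after the AS number is BIRD's route origin flag
# (i/e/?), which is of no interest here.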
1281 route = re.compile(rb"^\s(.+?)\s+.+?\[(?:AS(.*?))?.\]$")
1282
1283 log.info("Requesting routing table from Bird (%s)" % server)
1284
1285 aggregated_networks = []
1286
1287 # Send command to list all routes
1288 for line in self._bird_cmd(server, "show route"):
1289 m = route.match(line)
1290 if not m:
1291 # Skip empty lines
1292 if not line:
1293 pass
1294
1295 # Ignore any header lines with the name of the routing table
1296 elif line.startswith(b"Table"):
1297 pass
1298
1299 # Log anything else
1300 else:
1301 log.debug("Could not parse line: %s" % line.decode())
1302
1303 continue
1304
1305 # Fetch the extracted network and ASN
1306 network, autnum = m.groups()
1307
1308 # Decode into strings
1309 if network:
1310 network = network.decode()
1311 if autnum:
1312 autnum = autnum.decode()
1313
1314 # Collect all aggregated networks
1315 if not autnum:
1316 log.debug("%s is an aggregated network" % network)
1317 aggregated_networks.append(network)
1318 continue
1319
1320 # Insert it into the database
1321 self.db.execute("INSERT INTO announcements(network, autnum) \
1322 VALUES(%s, %s) ON CONFLICT (network) DO \
1323 UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
1324 network, autnum,
1325 )
1326
1327 # Process any aggregated networks
1328 for network in aggregated_networks:
1329 log.debug("Processing aggregated network %s" % network)
1330
1331 # Run "show route all" for each network
1332 for line in self._bird_cmd(server, "show route %s all" % network):
1333 # Try finding the path
1334 m = re.match(rb"\s+BGP\.as_path:.* (\d+) {\d+}$", line)
1335 if m:
1336 # Select the last AS number in the path
1337 autnum = m.group(1).decode()
1338
1339 # Insert it into the database
1340 self.db.execute("INSERT INTO announcements(network, autnum) \
1341 VALUES(%s, %s) ON CONFLICT (network) DO \
1342 UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
1343 network, autnum,
1344 )
1345
1346 # We don't need to process any more
1347 break
1348
1349 def _bird_cmd(self, socket_path, command):
1350 # Connect to the socket
1351 s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
1352 s.connect(socket_path)
1353
1354 # Allocate some buffer
1355 buffer = b""
1356
1357 log.debug("Sending Bird command: %s" % command)
1358
1359 # Send the command
1360 s.send(b"%s\n" % command.encode())
1361
1362 while True:
1363 # Fill up the buffer
1364 buffer += s.recv(4096)
1365
1366 while True:
1367 # Search for the next newline
1368 pos = buffer.find(b"\n")
1369
1370 # If we cannot find one, we go back and read more data
1371 if pos <= 0:
1372 break
1373
1374 # Cut after the newline character
1375 pos += 1
1376
1377 # Split the line we want and keep the rest in buffer
1378 line, buffer = buffer[:pos], buffer[pos:]
1379
1380 # Try parsing any status lines
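# BIRD prefixes status lines with a four-digit reply code followed by
# either a space (0x20, final line of a reply) or a minus (0x2D, a
# continuation line), hence the check for byte values 32 and 45.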
1381 if len(line) > 4 and line[:4].isdigit() and line[4] in (32, 45):
1382 code, delim, line = int(line[:4]), line[4], line[5:]
1383
1384 log.debug("Received response code %s from bird" % code)
1385
1386 # End of output
1387 if code == 0:
1388 return
1389
1390 # Ignore hello line
1391 elif code == 1:
1392 continue
1393
1394 # Otherwise return the line
1395 yield line
1396
1397 def handle_update_geofeeds(self, ns):
1398 # Sync geofeeds
1399 with self.db.transaction():
1400 # Delete all geofeeds which are no longer linked
1401 self.db.execute("""
1402 DELETE FROM
1403 geofeeds
1404 WHERE
1405 NOT EXISTS (
1406 SELECT
1407 1
1408 FROM
1409 network_geofeeds
1410 WHERE
1411 geofeeds.url = network_geofeeds.url
1412 )""",
1413 )
1414
1415 # Copy all geofeeds
1416 self.db.execute("""
1417 INSERT INTO
1418 geofeeds(
1419 url
1420 )
1421 SELECT
1422 url
1423 FROM
1424 network_geofeeds
1425 ON CONFLICT (url)
1426 DO NOTHING
1427 """,
1428 )
1429
1430 # Fetch all Geofeeds that require an update
1431 geofeeds = self.db.query("""
1432 SELECT
1433 id,
1434 url
1435 FROM
1436 geofeeds
1437 WHERE
1438 updated_at IS NULL
1439 OR
1440 updated_at <= CURRENT_TIMESTAMP - INTERVAL '1 week'
1441 ORDER BY
1442 id
1443 """)
1444
1445 with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
1446 results = executor.map(self._fetch_geofeed, geofeeds)
1447
1448 # Fetch all results to raise any exceptions
1449 for result in results:
1450 pass
1451
1452 # Delete data from any feeds that did not update in the last two weeks
1453 with self.db.transaction():
1454 self.db.execute("""
1455 DELETE FROM
1456 geofeed_networks
1457 WHERE
1458 geofeed_networks.geofeed_id IN (
1459 SELECT
1460 geofeeds.id
1461 FROM
1462 geofeeds
1463 WHERE
1464 updated_at IS NULL
1465 OR
1466 updated_at <= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
1467 )
1468 """)
1469
1470 def _fetch_geofeed(self, geofeed):
1471 log.debug("Fetching Geofeed %s" % geofeed.url)
1472
1473 with self.db.transaction():
1474 # Open the URL
1475 try:
1476 req = urllib.request.Request(geofeed.url, headers={
1477 "User-Agent" : "location/%s" % location.__version__,
1478
1479 # We expect some plain text file in CSV format
1480 "Accept" : "text/csv, text/plain",
1481 })
1482
1483 # XXX set proxy
1484
1485 # Send the request
1486 with urllib.request.urlopen(req, timeout=10) as f:
1487 # Remove any previous data
1488 self.db.execute("DELETE FROM geofeed_networks \
1489 WHERE geofeed_id = %s", geofeed.id)
1490
1491 lineno = 0
1492
1493 # Read the output line by line
1494 for line in f:
1495 lineno += 1
1496
1497 try:
1498 line = line.decode()
1499
1500 # Ignore any lines we cannot decode
1501 except UnicodeDecodeError:
1502 log.debug("Could not decode line %s in %s" \
1503 % (lineno, geofeed.url))
1504 continue
1505
1506 # Strip any newline
1507 line = line.rstrip()
1508
1509 # Skip empty lines
1510 if not line:
1511 continue
1512
1513 # Try to parse the line
1514 try:
1515 fields = line.split(",", 5)
1516 except ValueError:
1517 log.debug("Could not parse line: %s" % line)
1518 continue
1519
1520 # Check if we have enough fields
1521 if len(fields) < 4:
1522 log.debug("Not enough fields in line: %s" % line)
1523 continue
1524
1525 # Fetch all fields
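# Geofeed entries are CSV lines as specified in RFC 8805:
# prefix,country,region,city,postal; only the first four fields
# are used here.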
1526 network, country, region, city = fields[:4]
1527
1528 # Try to parse the network
1529 try:
1530 network = ipaddress.ip_network(network, strict=False)
1531 except ValueError:
1532 log.debug("Could not parse network: %s" % network)
1533 continue
1534
1535 # Strip any excess whitespace from country codes
1536 country = country.strip()
1537
1538 # Make the country code uppercase
1539 country = country.upper()
1540
1541 # Check the country code
1542 if not country:
1543 log.debug("Empty country code in Geofeed %s line %s" \
1544 % (geofeed.url, lineno))
1545 continue
1546
1547 elif not location.country_code_is_valid(country):
1548 log.debug("Invalid country code in Geofeed %s:%s: %s" \
1549 % (geofeed.url, lineno, country))
1550 continue
1551
1552 # Write this into the database
1553 self.db.execute("""
1554 INSERT INTO
1555 geofeed_networks (
1556 geofeed_id,
1557 network,
1558 country,
1559 region,
1560 city
1561 )
1562 VALUES (%s, %s, %s, %s, %s)""",
1563 geofeed.id,
1564 "%s" % network,
1565 country,
1566 region,
1567 city,
1568 )
1569
1570 # Catch any HTTP errors
1571 except urllib.request.HTTPError as e:
1572 self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
1573 WHERE id = %s", e.code, "%s" % e, geofeed.id)
1574
1575 # Remove any previous data when the feed has been deleted
1576 if e.code == 404:
1577 self.db.execute("DELETE FROM geofeed_networks \
1578 WHERE geofeed_id = %s", geofeed.id)
1579
1580 # Catch any other errors and connection timeouts
1581 except (http.client.InvalidURL, urllib.request.URLError, TimeoutError) as e:
1582 log.debug("Could not fetch URL %s: %s" % (geofeed.url, e))
1583
1584 self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
1585 WHERE id = %s", 599, "%s" % e, geofeed.id)
1586
1587 # Mark the geofeed as updated
1588 else:
1589 self.db.execute("""
1590 UPDATE
1591 geofeeds
1592 SET
1593 updated_at = CURRENT_TIMESTAMP,
1594 status = NULL,
1595 error = NULL
1596 WHERE
1597 id = %s""",
1598 geofeed.id,
1599 )
1600
1601 def handle_update_overrides(self, ns):
1602 with self.db.transaction():
1603 # Only drop manually created overrides and preserve the rest; feed
1604 # data is deleted by the corresponding update functions where appropriate.
1605 self.db.execute("""
1606 DELETE FROM autnum_overrides WHERE source = 'manual';
1607 DELETE FROM network_overrides WHERE source = 'manual';
1608 """)
1609
1610 # Update overrides for various cloud providers big enough to publish their own IP
1611 # network allocation lists in a machine-readable format...
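# (Despite its name, _update_overrides_for_aws() nowadays writes into
# the network_feeds table, just like the Spamhaus DROP handling below.)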
1612 self._update_overrides_for_aws()
1613
1614 # Update overrides for Spamhaus DROP feeds...
1615 self._update_feed_for_spamhaus_drop()
1616
1617 for file in ns.files:
1618 log.info("Reading %s..." % file)
1619
1620 with open(file, "rb") as f:
1621 for type, block in location.importer.read_blocks(f):
1622 if type == "net":
1623 network = block.get("net")
1624 # Try to parse and normalise the network
1625 try:
1626 network = ipaddress.ip_network(network, strict=False)
1627 except ValueError as e:
1628 log.warning("Invalid IP network: %s: %s" % (network, e))
1629 continue
1630
1631 # Prevent that we overwrite all networks
1632 if network.prefixlen == 0:
1633 log.warning("Skipping %s: You cannot overwrite default" % network)
1634 continue
1635
1636 self.db.execute("""
1637 INSERT INTO network_overrides(
1638 network,
1639 country,
1640 source,
1641 is_anonymous_proxy,
1642 is_satellite_provider,
1643 is_anycast,
1644 is_drop
1645 ) VALUES (%s, %s, %s, %s, %s, %s, %s)
1646 ON CONFLICT (network) DO NOTHING""",
1647 "%s" % network,
1648 block.get("country"),
1649 "manual",
1650 self._parse_bool(block, "is-anonymous-proxy"),
1651 self._parse_bool(block, "is-satellite-provider"),
1652 self._parse_bool(block, "is-anycast"),
1653 self._parse_bool(block, "drop"),
1654 )
1655
1656 elif type == "aut-num":
1657 autnum = block.get("aut-num")
1658
1659 # Check if AS number begins with "AS"
1660 if not autnum.startswith("AS"):
1661 log.warning("Invalid AS number: %s" % autnum)
1662 continue
1663
1664 # Strip "AS"
1665 autnum = autnum[2:]
1666
1667 self.db.execute("""
1668 INSERT INTO autnum_overrides(
1669 number,
1670 name,
1671 country,
1672 source,
1673 is_anonymous_proxy,
1674 is_satellite_provider,
1675 is_anycast,
1676 is_drop
1677 ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)
1678 ON CONFLICT DO NOTHING""",
1679 autnum,
1680 block.get("name"),
1681 block.get("country"),
1682 "manual",
1683 self._parse_bool(block, "is-anonymous-proxy"),
1684 self._parse_bool(block, "is-satellite-provider"),
1685 self._parse_bool(block, "is-anycast"),
1686 self._parse_bool(block, "drop"),
1687 )
1688
1689 else:
1690 log.warning("Unsupported type: %s" % type)
1691
1692 def _update_overrides_for_aws(self):
1693 # Download the Amazon AWS IP allocation file to create feed entries...
1694 downloader = location.importer.Downloader()
1695
1696 try:
1697 # Fetch IP ranges
1698 f = downloader.retrieve("https://ip-ranges.amazonaws.com/ip-ranges.json")
1699
1700 # Parse downloaded file
1701 aws_ip_dump = json.load(f)
1702 except Exception as e:
1703 log.error("unable to preprocess Amazon AWS IP ranges: %s" % e)
1704 return
1705
1706 # At this point, we can assume the downloaded file to be valid
1707 self.db.execute("""
1708 DELETE FROM network_feeds WHERE source = 'Amazon AWS IP feed'
1709 """)
1710
1711 # XXX: Set up a dictionary for mapping a region name to a country. Unfortunately,
1712 # there seems to be no machine-readable version available of this other than
1713 # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html
1714 # (worse, it seems to be incomplete :-/ ); https://www.cloudping.cloud/endpoints
1715 # was helpful here as well.
1716 aws_region_country_map = {
1717 "af-south-1": "ZA",
1718 "ap-east-1": "HK",
1719 "ap-south-1": "IN",
1720 "ap-south-2": "IN",
1721 "ap-northeast-3": "JP",
1722 "ap-northeast-2": "KR",
1723 "ap-southeast-1": "SG",
1724 "ap-southeast-2": "AU",
1725 "ap-southeast-3": "MY",
1726 "ap-southeast-4": "AU",
1727 "ap-southeast-5": "NZ", # Auckland, NZ
1728 "ap-southeast-6": "AP", # XXX: Precise location not documented anywhere
1729 "ap-northeast-1": "JP",
1730 "ca-central-1": "CA",
1731 "ca-west-1": "CA",
1732 "eu-central-1": "DE",
1733 "eu-central-2": "CH",
1734 "eu-west-1": "IE",
1735 "eu-west-2": "GB",
1736 "eu-south-1": "IT",
1737 "eu-south-2": "ES",
1738 "eu-west-3": "FR",
1739 "eu-north-1": "SE",
1740 "eusc-de-east-1" : "DE", # XXX: Undocumented, likely located in Berlin rather than Frankfurt
1741 "il-central-1": "IL", # XXX: This one is not documented anywhere except for ip-ranges.json itself
1742 "me-central-1": "AE",
1743 "me-south-1": "BH",
1744 "sa-east-1": "BR"
1745 }
1746
1747 # Fetch all valid country codes to check parsed networks against...
1748 rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
1749 validcountries = []
1750
1751 for row in rows:
1752 validcountries.append(row.country_code)
1753
1754 with self.db.transaction():
1755 for snetwork in aws_ip_dump["prefixes"] + aws_ip_dump["ipv6_prefixes"]:
1756 try:
1757 network = ipaddress.ip_network(snetwork.get("ip_prefix") or snetwork.get("ipv6_prefix"), strict=False)
1758 except ValueError:
1759 log.warning("Unable to parse line: %s" % snetwork)
1760 continue
1761
1762 # Sanitize parsed networks...
1763 if not self._check_parsed_network(network):
1764 continue
1765
1766 # Determine region of this network...
1767 region = snetwork["region"]
1768 cc = None
1769 is_anycast = False
1770
1771 # Any region name starting with "us-" will get "US" country code assigned straight away...
1772 if region.startswith("us-"):
1773 cc = "US"
1774 elif region.startswith("cn-"):
1775 # ... same goes for China ...
1776 cc = "CN"
1777 elif region == "GLOBAL":
1778 # ... funny region name for anycast-like networks ...
1779 is_anycast = True
1780 elif region in aws_region_country_map:
1781 # ... assign looked up country code otherwise ...
1782 cc = aws_region_country_map[region]
1783 else:
1784 # ... and bail out if we are missing something here
1785 log.warning("Unable to determine country code for line: %s" % snetwork)
1786 continue
1787
1788 # Skip networks with unknown country codes
1789 if not is_anycast and validcountries and cc not in validcountries:
1790 log.warning("Skipping Amazon AWS network with bogus country '%s': %s" % \
1791 (cc, network))
1792 continue
1793
1794 # Conduct SQL statement...
1795 self.db.execute("""
1796 INSERT INTO
1797 network_feeds
1798 (
1799 network,
1800 source,
1801 country,
1802 is_anycast
1803 )
1804 VALUES
1805 (
1806 %s, %s, %s, %s
1807 )
1808 ON CONFLICT (network, source) DO NOTHING
1809 """, "%s" % network, "Amazon AWS IP feed", cc, is_anycast,
1810 )
1811
1812 def _update_feed_for_spamhaus_drop(self):
1813 downloader = location.importer.Downloader()
1814
1815 ip_lists = [
1816 ("SPAMHAUS-DROP", "https://www.spamhaus.org/drop/drop.txt"),
1817 ("SPAMHAUS-EDROP", "https://www.spamhaus.org/drop/edrop.txt"),
1818 ("SPAMHAUS-DROPV6", "https://www.spamhaus.org/drop/dropv6.txt")
1819 ]
1820
1821 asn_lists = [
1822 ("SPAMHAUS-ASNDROP", "https://www.spamhaus.org/drop/asndrop.json")
1823 ]
1824
1825 for name, url in ip_lists:
1826 # Fetch IP list from given URL
1827 f = downloader.retrieve(url)
1828
1829 # Split into lines
1830 fcontent = f.readlines()
1831
1832 with self.db.transaction():
1833 # Conduct a very basic sanity check to rule out CDN issues causing bogus DROP
1834 # downloads.
1835 if len(fcontent) > 10:
1836 self.db.execute("DELETE FROM network_feeds WHERE source = %s", name)
1837 else:
1838 log.warning("%s (%s) returned likely bogus file, ignored" % (name, url))
1839 continue
1840
1841 # Iterate through every line, filter comments and add remaining networks to
1842 # the network_feeds table in case they are valid...
1843 for sline in fcontent:
1844 # The response is assumed to be encoded in UTF-8...
1845 sline = sline.decode("utf-8")
1846
1847 # Comments start with a semicolon...
1848 if sline.startswith(";"):
1849 continue
1850
1851 # Extract network and ignore anything afterwards...
1852 try:
1853 network = ipaddress.ip_network(sline.split()[0], strict=False)
1854 except ValueError:
1855 log.error("Unable to parse line: %s" % sline)
1856 continue
1857
1858 # Sanitize parsed networks...
1859 if not self._check_parsed_network(network):
1860 log.warning("Skipping bogus network found in %s (%s): %s" % \
1861 (name, url, network))
1862 continue
1863
1864 # Conduct SQL statement...
1865 self.db.execute("""
1866 INSERT INTO
1867 network_feeds
1868 (
1869 network,
1870 source,
1871 is_drop
1872 )
1873 VALUES
1874 (
1875 %s, %s, %s
1876 )""", "%s" % network, name, True,
1877 )
1878
1879 for name, url in asn_lists:
1880 # Fetch URL
1881 f = downloader.retrieve(url)
1882
1883 # Split into lines
1884 fcontent = f.readlines()
1885
1886 with self.db.transaction():
1887 # Conduct a very basic sanity check to rule out CDN issues causing bogus DROP
1888 # downloads.
1889 if len(fcontent) > 10:
1890 self.db.execute("DELETE FROM autnum_feeds WHERE source = %s", name)
1891 else:
1892 log.warning("%s (%s) returned likely bogus file, ignored" % (name, url))
1893 continue
1894
1895 # Iterate through every line, filter comments and add remaining ASNs to
1896 # the autnum_feeds table in case they are valid...
1897 for sline in fcontent:
1898 # The response is assumed to be encoded in UTF-8...
1899 sline = sline.decode("utf-8")
1900
1901 # Load every line as a JSON object and try to obtain an ASN from it...
1902 try:
1903 lineobj = json.loads(sline)
1904 except json.decoder.JSONDecodeError:
1905 log.error("Unable to parse line as a JSON object: %s" % sline)
1906 continue
1907
1908 # Skip line containing file metadata
1909 try:
1910 type = lineobj["type"]
1911
1912 if type == "metadata":
1913 continue
1914 except KeyError:
1915 pass
1916
1917 try:
1918 asn = lineobj["asn"]
1919 as_name = lineobj["asname"]
1920 except KeyError:
1921 log.warning("Unable to extract necessary information from line: %s" % sline)
1922 continue
1923
1924 # Filter invalid ASNs...
1925 if not self._check_parsed_asn(asn):
1926 log.warning("Skipping bogus ASN found in %s (%s): %s" % \
1927 (name, url, asn))
1928 continue
1929
1930 # Conduct SQL statement...
1931 self.db.execute("""
1932 INSERT INTO
1933 autnum_feeds
1934 (
1935 number,
1936 source,
1937 is_drop
1938 )
1939 VALUES
1940 (
1941 %s, %s, %s
1942 )""", "%s" % asn, name, True,
1943 )
1944
1945 @staticmethod
1946 def _parse_bool(block, key):
1947 val = block.get(key)
1948
1949 # There is no point to proceed when we got None
1950 if val is None:
1951 return
1952
1953 # Convert to lowercase
1954 val = val.lower()
1955
1956 # True
1957 if val in ("yes", "1"):
1958 return True
1959
1960 # False
1961 if val in ("no", "0"):
1962 return False
1963
1964 # Default to None
1965 return None
1966
1967 @property
1968 def countries(self):
1969 # Fetch all valid country codes to check parsed networks against
1970 rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
1971
1972 # Return all countries
1973 return [row.country_code for row in rows]
1974
1975 def handle_import_countries(self, ns):
1976 with self.db.transaction():
1977 # Drop all data that we have
1978 self.db.execute("TRUNCATE TABLE countries")
1979
1980 for file in ns.file:
1981 for line in file:
1982 line = line.rstrip()
1983
1984 # Ignore any comments
1985 if line.startswith("#"):
1986 continue
1987
1988 try:
1989 country_code, continent_code, name = line.split(maxsplit=2)
1990 except ValueError:
1991 log.warning("Could not parse line: %s" % line)
1992 continue
1993
1994 self.db.execute("INSERT INTO countries(country_code, name, continent_code) \
1995 VALUES(%s, %s, %s) ON CONFLICT DO NOTHING", country_code, name, continent_code)
1996
1997
1998 def split_line(line):
1999 key, colon, val = line.partition(":")
2000
2001 # Strip any excess space
2002 key = key.strip()
2003 val = val.strip()
2004
2005 return key, val
2006
2007 def main():
2008 # Run the command line interface
2009 c = CLI()
2010 c.run()
2011
2012 main()