]> git.ipfire.org Git - people/ms/libloc.git/blame - src/scripts/location-importer.in
importer: Convert networks back to string
[people/ms/libloc.git] / src / scripts / location-importer.in
CommitLineData
78ff0cf2
MT
1#!/usr/bin/python3
2###############################################################################
3# #
4# libloc - A library to determine the location of someone on the Internet #
5# #
5c7cfeb2 6# Copyright (C) 2020-2024 IPFire Development Team <info@ipfire.org> #
78ff0cf2
MT
7# #
8# This library is free software; you can redistribute it and/or #
9# modify it under the terms of the GNU Lesser General Public #
10# License as published by the Free Software Foundation; either #
11# version 2.1 of the License, or (at your option) any later version. #
12# #
13# This library is distributed in the hope that it will be useful, #
14# but WITHOUT ANY WARRANTY; without even the implied warranty of #
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
16# Lesser General Public License for more details. #
17# #
18###############################################################################
19
import argparse
import asyncio
import csv
import functools
import http.client
import io
import ipaddress
import json
import logging
import math
import re
import socket
import sys
import urllib.error
import urllib.parse

# Load our location module
import location
import location.database
from location.downloader import Downloader
from location.i18n import _
40
# Initialise logging
log = logging.getLogger("location.importer")
log.propagate = 1

# ASN ranges that are publicly routable, as (start, end) pairs,
# both ends inclusive
VALID_ASN_RANGES = (
	(1, 23455),
	(23457, 64495),
	(131072, 4199999999),
)

# Country codes that need to be rewritten before use
TRANSLATED_COUNTRIES = {
	# When people say UK, they mean GB
	"UK" : "GB",
}

# Country codes that are dropped entirely
IGNORED_COUNTRIES = {
	# Formerly Yugoslavia
	"YU",

	# Some people use ZZ to say "no country" or to hide the country
	"ZZ",
}

# Configure the CSV parser for ARIN
csv.register_dialect("arin", delimiter=",", quoting=csv.QUOTE_ALL, quotechar="\"")
43fe570c 67
78ff0cf2
MT
68class CLI(object):
69 def parse_cli(self):
70 parser = argparse.ArgumentParser(
71 description=_("Location Importer Command Line Interface"),
72 )
6ffd06b5 73 subparsers = parser.add_subparsers()
78ff0cf2
MT
74
75 # Global configuration flags
76 parser.add_argument("--debug", action="store_true",
77 help=_("Enable debug output"))
bc1f5f53
MT
78 parser.add_argument("--quiet", action="store_true",
79 help=_("Enable quiet mode"))
78ff0cf2
MT
80
81 # version
82 parser.add_argument("--version", action="version",
83 version="%(prog)s @VERSION@")
84
29c6fa22
MT
85 # Database
86 parser.add_argument("--database-host", required=True,
87 help=_("Database Hostname"), metavar=_("HOST"))
88 parser.add_argument("--database-name", required=True,
89 help=_("Database Name"), metavar=_("NAME"))
90 parser.add_argument("--database-username", required=True,
91 help=_("Database Username"), metavar=_("USERNAME"))
92 parser.add_argument("--database-password", required=True,
93 help=_("Database Password"), metavar=_("PASSWORD"))
94
0983f3dd
MT
95 # Write Database
96 write = subparsers.add_parser("write", help=_("Write database to file"))
97 write.set_defaults(func=self.handle_write)
98 write.add_argument("file", nargs=1, help=_("Database File"))
99 write.add_argument("--signing-key", nargs="?", type=open, help=_("Signing Key"))
1164d876 100 write.add_argument("--backup-signing-key", nargs="?", type=open, help=_("Backup Signing Key"))
0983f3dd
MT
101 write.add_argument("--vendor", nargs="?", help=_("Sets the vendor"))
102 write.add_argument("--description", nargs="?", help=_("Sets a description"))
103 write.add_argument("--license", nargs="?", help=_("Sets the license"))
b904896a 104 write.add_argument("--version", type=int, help=_("Database Format Version"))
0983f3dd 105
6ffd06b5
MT
106 # Update WHOIS
107 update_whois = subparsers.add_parser("update-whois", help=_("Update WHOIS Information"))
d77d05d1
MT
108 update_whois.add_argument("sources", nargs="*",
109 help=_("Only update these sources"))
6ffd06b5
MT
110 update_whois.set_defaults(func=self.handle_update_whois)
111
83d61c46
MT
112 # Update announcements
113 update_announcements = subparsers.add_parser("update-announcements",
114 help=_("Update BGP Annoucements"))
115 update_announcements.set_defaults(func=self.handle_update_announcements)
116 update_announcements.add_argument("server", nargs=1,
117 help=_("Route Server to connect to"), metavar=_("SERVER"))
118
0fcdd689
MT
119 # Update geofeeds
120 update_geofeeds = subparsers.add_parser("update-geofeeds",
121 help=_("Update Geofeeds"))
122 update_geofeeds.set_defaults(func=self.handle_update_geofeeds)
123
93aad7f7
MT
124 # Update feeds
125 update_feeds = subparsers.add_parser("update-feeds",
126 help=_("Update Feeds"))
f7f8e714
MT
127 update_feeds.add_argument("feeds", nargs="*",
128 help=_("Only update these feeds"))
93aad7f7
MT
129 update_feeds.set_defaults(func=self.handle_update_feeds)
130
d7fc3057
MT
131 # Update overrides
132 update_overrides = subparsers.add_parser("update-overrides",
133 help=_("Update overrides"),
134 )
135 update_overrides.add_argument(
136 "files", nargs="+", help=_("Files to import"),
137 )
138 update_overrides.set_defaults(func=self.handle_update_overrides)
139
8084b33a
MT
140 # Import countries
141 import_countries = subparsers.add_parser("import-countries",
142 help=_("Import countries"),
143 )
144 import_countries.add_argument("file", nargs=1, type=argparse.FileType("r"),
145 help=_("File to import"))
146 import_countries.set_defaults(func=self.handle_import_countries)
147
78ff0cf2
MT
148 args = parser.parse_args()
149
bc1f5f53 150 # Configure logging
78ff0cf2 151 if args.debug:
f9de5e61 152 location.logger.set_level(logging.DEBUG)
bc1f5f53
MT
153 elif args.quiet:
154 location.logger.set_level(logging.WARNING)
78ff0cf2 155
6ffd06b5
MT
156 # Print usage if no action was given
157 if not "func" in args:
158 parser.print_usage()
159 sys.exit(2)
160
78ff0cf2
MT
161 return args
162
98d9079b 163 async def run(self):
78ff0cf2
MT
164 # Parse command line arguments
165 args = self.parse_cli()
166
335946c0
MT
167 # Initialize the downloader
168 self.downloader = Downloader()
169
29c6fa22 170 # Initialise database
6ffd06b5 171 self.db = self._setup_database(args)
29c6fa22 172
78ff0cf2 173 # Call function
98d9079b 174 ret = await args.func(args)
78ff0cf2
MT
175
176 # Return with exit code
177 if ret:
178 sys.exit(ret)
179
180 # Otherwise just exit
181 sys.exit(0)
182
29c6fa22
MT
	def _setup_database(self, ns):
		"""
		Initialise the database

		Connects to PostgreSQL using the credentials from the parsed
		command line namespace and creates/migrates the schema before
		returning the open connection.
		"""
		# Connect to database
		db = location.database.Connection(
			host=ns.database_host, database=ns.database_name,
			user=ns.database_username, password=ns.database_password,
		)

		# Create or migrate the schema inside one transaction. All
		# statements are idempotent (IF [NOT] EXISTS / ADD COLUMN IF NOT
		# EXISTS), so this is safe to run on every start; the ALTER/DROP
		# statements migrate databases created by older versions.
		with db.transaction():
			db.execute("""
				-- announcements
				CREATE TABLE IF NOT EXISTS announcements(network inet, autnum bigint,
					first_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
					last_seen_at timestamp without time zone DEFAULT CURRENT_TIMESTAMP);
				CREATE UNIQUE INDEX IF NOT EXISTS announcements_networks ON announcements(network);
				CREATE INDEX IF NOT EXISTS announcements_family ON announcements(family(network));
				CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);

				-- autnums
				CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
				ALTER TABLE autnums ADD COLUMN IF NOT EXISTS source text;
				CREATE UNIQUE INDEX IF NOT EXISTS autnums_number ON autnums(number);

				-- countries
				CREATE TABLE IF NOT EXISTS countries(
					country_code text NOT NULL, name text NOT NULL, continent_code text NOT NULL);
				CREATE UNIQUE INDEX IF NOT EXISTS countries_country_code ON countries(country_code);

				-- networks
				CREATE TABLE IF NOT EXISTS networks(network inet, country text);
				ALTER TABLE networks ADD COLUMN IF NOT EXISTS original_countries text[];
				ALTER TABLE networks ADD COLUMN IF NOT EXISTS source text;
				CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
				CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
				CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);

				-- geofeeds
				CREATE TABLE IF NOT EXISTS geofeeds(
					id serial primary key,
					url text,
					status integer default null,
					updated_at timestamp without time zone default null
				);
				ALTER TABLE geofeeds ADD COLUMN IF NOT EXISTS error text;
				CREATE UNIQUE INDEX IF NOT EXISTS geofeeds_unique
					ON geofeeds(url);
				CREATE TABLE IF NOT EXISTS geofeed_networks(
					geofeed_id integer references geofeeds(id) on delete cascade,
					network inet,
					country text,
					region text,
					city text
				);
				CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
					ON geofeed_networks(geofeed_id);
				CREATE INDEX IF NOT EXISTS geofeed_networks_search
					ON geofeed_networks USING GIST(network inet_ops);
				CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
				ALTER TABLE network_geofeeds ADD COLUMN IF NOT EXISTS source text NOT NULL;
				CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique2
					ON network_geofeeds(network, url);
				CREATE INDEX IF NOT EXISTS network_geofeeds_search
					ON network_geofeeds USING GIST(network inet_ops);
				CREATE INDEX IF NOT EXISTS network_geofeeds_url
					ON network_geofeeds(url);
				DROP INDEX IF EXISTS network_geofeeds_unique;

				-- feeds
				CREATE TABLE IF NOT EXISTS autnum_feeds(
					number bigint NOT NULL,
					source text NOT NULL,
					name text,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean,
					is_drop boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS autnum_feeds_unique
					ON autnum_feeds(number, source);

				CREATE TABLE IF NOT EXISTS network_feeds(
					network inet NOT NULL,
					source text NOT NULL,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean,
					is_drop boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS network_feeds_unique
					ON network_feeds(network, source);
				CREATE INDEX IF NOT EXISTS network_feeds_search
					ON network_feeds USING GIST(network inet_ops);

				-- overrides
				CREATE TABLE IF NOT EXISTS autnum_overrides(
					number bigint NOT NULL,
					name text,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS autnum_overrides_number
					ON autnum_overrides(number);
				ALTER TABLE autnum_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
				ALTER TABLE autnum_overrides DROP COLUMN IF EXISTS source;

				CREATE TABLE IF NOT EXISTS network_overrides(
					network inet NOT NULL,
					country text,
					is_anonymous_proxy boolean,
					is_satellite_provider boolean,
					is_anycast boolean
				);
				CREATE UNIQUE INDEX IF NOT EXISTS network_overrides_network
					ON network_overrides(network);
				CREATE INDEX IF NOT EXISTS network_overrides_search
					ON network_overrides USING GIST(network inet_ops);
				ALTER TABLE network_overrides ADD COLUMN IF NOT EXISTS is_drop boolean;
				ALTER TABLE network_overrides DROP COLUMN IF EXISTS source;

				DROP TABLE IF EXISTS geofeed_overrides;
			""")

		# Hand the open connection back to the caller
		return db
312
e61a9352
MT
313 def fetch_countries(self):
314 """
315 Returns a list of all countries on the list
316 """
317 # Fetch all valid country codes to check parsed networks aganist...
318 countries = self.db.query("SELECT country_code FROM countries ORDER BY country_code")
319
ae20698a 320 return set((country.country_code for country in countries))
e61a9352 321
	async def handle_write(self, ns):
		"""
		Compiles a database in libloc format out of what is in the database

		Writes all Autonomous Systems (merged with overrides), all known
		networks (country/ASN/flags resolved by precedence: network
		overrides, AS overrides/feeds, geofeeds, then RIR data) and all
		countries, then serialises everything to the given file(s).
		"""
		# Allocate a writer
		writer = location.Writer(ns.signing_key, ns.backup_signing_key)

		# Set all metadata
		if ns.vendor:
			writer.vendor = ns.vendor

		if ns.description:
			writer.description = ns.description

		if ns.license:
			writer.license = ns.license

		# Add all Autonomous Systems
		log.info("Writing Autonomous Systems...")

		# Select all ASes with a name
		rows = self.db.query("""
			SELECT
				autnums.number AS number,
				COALESCE(
					overrides.name,
					autnums.name
				) AS name
			FROM
				autnums
			LEFT JOIN
				autnum_overrides overrides ON autnums.number = overrides.number
			ORDER BY
				autnums.number
			""")

		for row in rows:
			# Skip AS without names
			if not row.name:
				continue

			a = writer.add_as(row.number)
			a.name = row.name

		# Add all networks
		log.info("Writing networks...")

		# Select all known networks
		rows = self.db.query("""
			WITH known_networks AS (
				SELECT network FROM announcements
				UNION
				SELECT network FROM networks
				UNION
				SELECT network FROM network_feeds
				UNION
				SELECT network FROM network_overrides
				UNION
				SELECT network FROM geofeed_networks
			),

			ordered_networks AS (
				SELECT
					known_networks.network AS network,
					announcements.autnum AS autnum,
					networks.country AS country,

					-- Must be part of returned values for ORDER BY clause
					masklen(announcements.network) AS sort_a,
					masklen(networks.network) AS sort_b
				FROM
					known_networks
				LEFT JOIN
					announcements ON known_networks.network <<= announcements.network
				LEFT JOIN
					networks ON known_networks.network <<= networks.network
				ORDER BY
					known_networks.network,
					sort_a DESC,
					sort_b DESC
			)

			-- Return a list of those networks enriched with all
			-- other information that we store in the database
			SELECT
				DISTINCT ON (network)
				network,
				autnum,

				-- Country
				COALESCE(
					(
						SELECT
							country
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							country
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					(
						SELECT
							country
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							country
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							geofeed_networks.country AS country
						FROM
							network_geofeeds

						-- Join the data from the geofeeds
						LEFT JOIN
							geofeeds ON network_geofeeds.url = geofeeds.url
						LEFT JOIN
							geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id

						-- Check whether we have a geofeed for this network
						WHERE
							networks.network <<= network_geofeeds.network
						AND
							networks.network <<= geofeed_networks.network

						-- Filter for the best result
						ORDER BY
							masklen(geofeed_networks.network) DESC
						LIMIT 1
					),
					networks.country
				) AS country,

				-- Flags
				COALESCE(
					(
						SELECT
							is_anonymous_proxy
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anonymous_proxy
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anonymous_proxy
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_anonymous_proxy
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_anonymous_proxy,
				COALESCE(
					(
						SELECT
							is_satellite_provider
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_satellite_provider
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_satellite_provider
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_satellite_provider
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_satellite_provider,
				COALESCE(
					(
						SELECT
							is_anycast
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anycast
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_anycast
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_anycast
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_anycast,
				COALESCE(
					(
						SELECT
							is_drop
						FROM
							network_overrides overrides
						WHERE
							networks.network <<= overrides.network
						ORDER BY
							masklen(overrides.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_drop
						FROM
							network_feeds feeds
						WHERE
							networks.network <<= feeds.network
						ORDER BY
							masklen(feeds.network) DESC
						LIMIT 1
					),
					(
						SELECT
							is_drop
						FROM
							autnum_feeds feeds
						WHERE
							networks.autnum = feeds.number
						ORDER BY
							source
						LIMIT 1
					),
					(
						SELECT
							is_drop
						FROM
							autnum_overrides overrides
						WHERE
							networks.autnum = overrides.number
					),
					FALSE
				) AS is_drop
			FROM
				ordered_networks networks
			""")

		for row in rows:
			# Convert the network back to a string for the writer
			network = writer.add_network("%s" % row.network)

			# Save country
			if row.country:
				network.country_code = row.country

			# Save ASN
			if row.autnum:
				network.asn = row.autnum

			# Set flags
			if row.is_anonymous_proxy:
				network.set_flag(location.NETWORK_FLAG_ANONYMOUS_PROXY)

			if row.is_satellite_provider:
				network.set_flag(location.NETWORK_FLAG_SATELLITE_PROVIDER)

			if row.is_anycast:
				network.set_flag(location.NETWORK_FLAG_ANYCAST)

			if row.is_drop:
				network.set_flag(location.NETWORK_FLAG_DROP)

		# Add all countries
		log.info("Writing countries...")
		rows = self.db.query("SELECT * FROM countries ORDER BY country_code")

		for row in rows:
			c = writer.add_country(row.country_code)
			c.continent_code = row.continent_code
			c.name = row.name

		# Write everything to file
		log.info("Writing database to file...")
		for file in ns.file:
			writer.write(file)
698
98d9079b 699 async def handle_update_whois(self, ns):
3ce28dea 700 # Did we run successfully?
8c2dbbc2
MT
701 success = True
702
703 sources = (
704 # African Network Information Centre
705 ("AFRINIC", (
706 (self._import_standard_format, "https://ftp.afrinic.net/pub/pub/dbase/afrinic.db.gz"),
707 )),
708
709 # Asia Pacific Network Information Centre
710 ("APNIC", (
711 (self._import_standard_format, "https://ftp.apnic.net/apnic/whois/apnic.db.inet6num.gz"),
712 (self._import_standard_format, "https://ftp.apnic.net/apnic/whois/apnic.db.inetnum.gz"),
713 (self._import_standard_format, "https://ftp.apnic.net/apnic/whois/apnic.db.aut-num.gz"),
714 (self._import_standard_format, "https://ftp.apnic.net/apnic/whois/apnic.db.organisation.gz"),
715 )),
716
717 # American Registry for Internet Numbers
718 ("ARIN", (
719 (self._import_extended_format, "https://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest"),
720 (self._import_arin_as_names, "https://ftp.arin.net/pub/resource_registry_service/asns.csv"),
721 )),
722
723 # Japan Network Information Center
724 ("JPNIC", (
725 (self._import_standard_format, "https://ftp.nic.ad.jp/jpirr/jpirr.db.gz"),
726 )),
727
728 # Latin America and Caribbean Network Information Centre
729 ("LACNIC", (
730 (self._import_standard_format, "https://ftp.lacnic.net/lacnic/dbase/lacnic.db.gz"),
731 (self._import_extended_format, "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"),
732 )),
733
734 # Réseaux IP Européens
735 ("RIPE", (
736 (self._import_standard_format, "https://ftp.ripe.net/ripe/dbase/split/ripe.db.inet6num.gz"),
737 (self._import_standard_format, "https://ftp.ripe.net/ripe/dbase/split/ripe.db.inetnum.gz"),
738 (self._import_standard_format, "https://ftp.ripe.net/ripe/dbase/split/ripe.db.aut-num.gz"),
739 (self._import_standard_format, "https://ftp.ripe.net/ripe/dbase/split/ripe.db.organisation.gz"),
740 )),
741 )
84b175e2 742
fa9220b0 743 # Fetch all valid country codes to check parsed networks against
d4697973
MT
744 countries = self.fetch_countries()
745
746 # Check if we have countries
747 if not countries:
748 log.error("Please import countries before importing any WHOIS data")
749 return 1
6ffd06b5 750
3ce28dea 751 # Iterate over all potential sources
8c2dbbc2 752 for name, feeds in sources:
d77d05d1 753 # Skip anything that should not be updated
8c2dbbc2 754 if ns.sources and not name in ns.sources:
d77d05d1
MT
755 continue
756
8c2dbbc2 757 try:
98d9079b 758 await self._process_source(name, feeds, countries)
002deb6b 759
8c2dbbc2
MT
760 # Log an error but continue if an exception occurs
761 except Exception as e:
762 log.error("Error processing source %s" % name, exc_info=True)
763 success = False
764
765 # Return a non-zero exit code for errors
766 return 0 if success else 1
767
	async def _process_source(self, source, feeds, countries):
		"""
		This function processes one source

		Downloads and parses every feed of the source into temporary
		tables, then merges the parsed data into the persistent
		networks/autnums tables. Everything runs inside one database
		transaction so a failing feed leaves no partial import behind.
		"""
		# Wrap everything into one large transaction
		with self.db.transaction():
			# Remove all previously imported content
			self.db.execute("DELETE FROM autnums WHERE source = %s", source)
			self.db.execute("DELETE FROM networks WHERE source = %s", source)
			self.db.execute("DELETE FROM network_geofeeds WHERE source = %s", source)

			# Create some temporary tables to store parsed data
			self.db.execute("""
				CREATE TEMPORARY TABLE _autnums(number integer NOT NULL,
					organization text NOT NULL, source text NOT NULL) ON COMMIT DROP;
				CREATE UNIQUE INDEX _autnums_number ON _autnums(number);

				CREATE TEMPORARY TABLE _organizations(handle text NOT NULL,
					name text NOT NULL, source text NOT NULL) ON COMMIT DROP;
				CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);

				CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text,
					original_countries text[] NOT NULL, source text NOT NULL)
					ON COMMIT DROP;
				CREATE INDEX _rirdata_search ON _rirdata
					USING BTREE(family(network), masklen(network));
				CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
			""")

			# Parse all feeds
			for callback, url, *args in feeds:
				# Retrieve the feed
				f = self.downloader.retrieve(url)

				# Call the callback
				with self.db.pipeline():
					await callback(source, countries, f, *args)

			# Process all parsed networks from every RIR we happen to have access to,
			# insert the largest network chunks into the networks table immediately...
			families = self.db.query("""
				SELECT DISTINCT
					family(network) AS family
				FROM
					_rirdata
				ORDER BY
					family(network)
				""",
			)

			# Handle IPv4 and IPv6 independently of each other
			for family in (row.family for row in families):
				# Fetch the smallest mask length in our data set
				smallest = self.db.get("""
					SELECT
						MIN(
							masklen(network)
						) AS prefix
					FROM
						_rirdata
					WHERE
						family(network) = %s
					""", family,
				)

				# Copy all networks
				self.db.execute("""
					INSERT INTO
						networks
					(
						network,
						country,
						original_countries,
						source
					)
					SELECT
						network,
						country,
						original_countries,
						source
					FROM
						_rirdata
					WHERE
						masklen(network) = %s
					AND
						family(network) = %s
					ON CONFLICT DO
						NOTHING""",
					smallest.prefix,
					family,
				)

				# ... determine any other prefixes for this network family, ...
				prefixes = self.db.query("""
					SELECT
						DISTINCT masklen(network) AS prefix
					FROM
						_rirdata
					WHERE
						family(network) = %s
					ORDER BY
						masklen(network) ASC
					OFFSET 1
					""", family,
				)

				# ... and insert networks with this prefix in case they provide additional
				# information (i. e. subnet of a larger chunk with a different country)
				for prefix in (row.prefix for row in prefixes):
					self.db.execute("""
						WITH candidates AS (
							SELECT
								_rirdata.network,
								_rirdata.country,
								_rirdata.original_countries,
								_rirdata.source
							FROM
								_rirdata
							WHERE
								family(_rirdata.network) = %s
							AND
								masklen(_rirdata.network) = %s
						),
						filtered AS (
							SELECT
								DISTINCT ON (c.network)
								c.network,
								c.country,
								c.original_countries,
								c.source,
								masklen(networks.network),
								networks.country AS parent_country
							FROM
								candidates c
							LEFT JOIN
								networks
							ON
								c.network << networks.network
							ORDER BY
								c.network,
								masklen(networks.network) DESC NULLS LAST
						)
						INSERT INTO
							networks(network, country, original_countries, source)
						SELECT
							network,
							country,
							original_countries,
							source
						FROM
							filtered
						WHERE
							parent_country IS NULL
						OR
							country <> parent_country
						ON CONFLICT DO NOTHING
						""", family, prefix,
					)

			# Resolve AS organisation handles into names and store them
			self.db.execute("""
				INSERT INTO
					autnums
				(
					number,
					name,
					source
				)
				SELECT
					_autnums.number,
					_organizations.name,
					_organizations.source
				FROM
					_autnums
				JOIN
					_organizations ON _autnums.organization = _organizations.handle
				ON CONFLICT
				(
					number
				)
				DO UPDATE
					SET name = excluded.name
				""",
			)
3ce28dea 950
98d9079b 951 async def _import_standard_format(self, source, countries, f, *args):
8c2dbbc2
MT
952 """
953 Imports a single standard format source feed
954 """
955 # Iterate over all blocks
8c3529c7 956 for block in iterate_over_blocks(f):
8c2dbbc2
MT
957 self._parse_block(block, source, countries)
958
98d9079b 959 async def _import_extended_format(self, source, countries, f, *args):
8c2dbbc2 960 # Iterate over all lines
8c3529c7 961 for line in iterate_over_lines(f):
c82e2eab 962 self._parse_line(line, source, countries)
8c2dbbc2 963
98d9079b 964 async def _import_arin_as_names(self, source, countries, f, *args):
fa5dccf1
MT
965 # Wrap the data to text
966 f = io.TextIOWrapper(f)
967
8c2dbbc2 968 # Walk through the file
932a7552 969 for line in csv.DictReader(f, dialect="arin"):
8c2dbbc2
MT
970 # Fetch status
971 status = line.get("Status")
972
973 # We are only interested in anything managed by ARIN
974 if not status == "Full Registry Services":
975 continue
976
977 # Fetch organization name
978 name = line.get("Org Name")
979
980 # Extract ASNs
981 first_asn = line.get("Start AS Number")
982 last_asn = line.get("End AS Number")
983
984 # Cast to a number
985 try:
986 first_asn = int(first_asn)
987 except TypeError as e:
988 log.warning("Could not parse ASN '%s'" % first_asn)
989 continue
990
991 try:
992 last_asn = int(last_asn)
993 except TypeError as e:
994 log.warning("Could not parse ASN '%s'" % last_asn)
995 continue
996
997 # Check if the range is valid
998 if last_asn < first_asn:
999 log.warning("Invalid ASN range %s-%s" % (first_asn, last_asn))
1000
1001 # Insert everything into the database
1002 for asn in range(first_asn, last_asn + 1):
1003 if not self._check_parsed_asn(asn):
1004 log.warning("Skipping invalid ASN %s" % asn)
1005 continue
0365119d 1006
3ce28dea 1007 self.db.execute("""
fa9220b0
MT
1008 INSERT INTO
1009 autnums
1010 (
1011 number,
1012 name,
1013 source
1014 )
8c2dbbc2
MT
1015 VALUES
1016 (
1017 %s, %s, %s
1018 )
fa9220b0
MT
1019 ON CONFLICT
1020 (
1021 number
1022 )
8c2dbbc2
MT
1023 DO NOTHING
1024 """, asn, name, "ARIN",
fa9220b0 1025 )
429a43d1 1026
bd341642
PM
1027 def _check_parsed_network(self, network):
1028 """
1029 Assistive function to detect and subsequently sort out parsed
1030 networks from RIR data (both Whois and so-called "extended sources"),
1031 which are or have...
1032
1033 (a) not globally routable (RFC 1918 space, et al.)
1034 (b) covering a too large chunk of the IP address space (prefix length
1035 is < 7 for IPv4 networks, and < 10 for IPv6)
1036 (c) "0.0.0.0" or "::" as a network address
bd341642
PM
1037
1038 This unfortunately is necessary due to brain-dead clutter across
1039 various RIR databases, causing mismatches and eventually disruptions.
1040
1041 We will return False in case a network is not suitable for adding
1042 it to our database, and True otherwise.
1043 """
d9b0a6ea
MT
1044 # Check input
1045 if isinstance(network, ipaddress.IPv6Network):
1046 pass
1047 elif isinstance(network, ipaddress.IPv4Network):
1048 pass
1049 else:
1050 raise ValueError("Invalid network: %s (type %s)" % (network, type(network)))
bd341642 1051
cb45f0d5 1052 # Ignore anything that isn't globally routable
bd341642 1053 if not network.is_global:
2ba6ed07 1054 log.debug("Skipping non-globally routable network: %s" % network)
bd341642
PM
1055 return False
1056
cb45f0d5
MT
1057 # Ignore anything that is unspecified IP range (See RFC 5735 for IPv4 or RFC 2373 for IPv6)
1058 elif network.is_unspecified:
1059 log.debug("Skipping unspecified network: %s" % network)
1060 return False
1061
c940dcf1
MT
1062 # IPv6
1063 if network.version == 6:
bd341642 1064 if network.prefixlen < 10:
2ba6ed07 1065 log.debug("Skipping too big IP chunk: %s" % network)
bd341642
PM
1066 return False
1067
c940dcf1
MT
1068 # IPv4
1069 elif network.version == 4:
1070 if network.prefixlen < 7:
1071 log.debug("Skipping too big IP chunk: %s" % network)
bd341642
PM
1072 return False
1073
bd341642
PM
1074 # In case we have made it here, the network is considered to
1075 # be suitable for libloc consumption...
1076 return True
1077
43fe570c
PM
1078 def _check_parsed_asn(self, asn):
1079 """
1080 Assistive function to filter Autonomous System Numbers not being suitable
1081 for adding to our database. Returns False in such cases, and True otherwise.
1082 """
1083
1084 for start, end in VALID_ASN_RANGES:
1085 if start <= asn and end >= asn:
1086 return True
1087
1088 log.info("Supplied ASN %s out of publicly routable ASN ranges" % asn)
1089 return False
1090
c60e31ea
MT
1091 def _check_geofeed_url(self, url):
1092 """
1093 This function checks if a Geofeed URL is valid.
1094
1095 If so, it returns the normalized URL which should be stored instead of
1096 the original one.
1097 """
1098 # Parse the URL
1099 try:
1100 url = urllib.parse.urlparse(url)
1101 except ValueError as e:
1102 log.warning("Invalid URL %s: %s" % (url, e))
1103 return
1104
1105 # Make sure that this is a HTTPS URL
1106 if not url.scheme == "https":
1107 log.warning("Skipping Geofeed URL that is not using HTTPS: %s" \
1108 % url.geturl())
1109 return
1110
1111 # Normalize the URL and convert it back
1112 return url.geturl()
1113
d4697973 1114 def _parse_block(self, block, source_key, countries):
6ffd06b5
MT
1115 # Get first line to find out what type of block this is
1116 line = block[0]
1117
6ffd06b5 1118 # aut-num
429a43d1 1119 if line.startswith("aut-num:"):
28b08385 1120 return self._parse_autnum_block(block, source_key)
6ffd06b5 1121
aadac4c5
PM
1122 # inetnum
1123 if line.startswith("inet6num:") or line.startswith("inetnum:"):
d4697973 1124 return self._parse_inetnum_block(block, source_key, countries)
aadac4c5 1125
6ffd06b5
MT
1126 # organisation
1127 elif line.startswith("organisation:"):
28b08385 1128 return self._parse_org_block(block, source_key)
6ffd06b5 1129
	def _parse_autnum_block(self, block, source_key):
		"""
		Parses one aut-num block from RIR data and stores the ASN and its
		organisation handle in the temporary _autnums/_organizations tables.
		"""
		autnum = {}
		for line in block:
			# Split line
			key, val = split_line(line)

			if key == "aut-num":
				# Accept "AS" or "as" prefixes and keep only the digits
				m = re.match(r"^(AS|as)(\d+)", val)
				if m:
					autnum["asn"] = m.group(2)

			elif key == "org":
				# Organisation handles are normalised to upper case
				autnum[key] = val.upper()

			elif key == "descr":
				# Save the first description line as well...
				if not key in autnum:
					autnum[key] = val

		# Skip empty objects
		if not autnum or not "asn" in autnum:
			return

		# Insert a dummy organisation handle into our temporary organisations
		# table in case the AS does not have an organisation handle set, but
		# has a description (a quirk often observed in APNIC area), so we can
		# later display at least some string for this AS.
		if not "org" in autnum:
			if "descr" in autnum:
				autnum["org"] = "LIBLOC-%s-ORGHANDLE" % autnum.get("asn")

				self.db.execute("INSERT INTO _organizations(handle, name, source) \
					VALUES(%s, %s, %s) ON CONFLICT (handle) DO NOTHING",
					autnum.get("org"), autnum.get("descr"), source_key,
				)
			else:
				# Nothing useful to show for this AS at all
				log.warning("ASN %s neither has an organisation handle nor a description line set, omitting" % \
					autnum.get("asn"))
				return

		# Insert into database
		self.db.execute("INSERT INTO _autnums(number, organization, source) \
			VALUES(%s, %s, %s) ON CONFLICT (number) DO UPDATE SET \
			organization = excluded.organization",
			autnum.get("asn"), autnum.get("org"), source_key,
		)
1176
	def _parse_inetnum_block(self, block, source_key, countries):
		"""
		Parses one inetnum/inet6num block from RIR data and inserts all
		resulting networks into the temporary _rirdata table, together with
		the first (usable) country code and any Geofeed reference.

		countries is the set/mapping of known country codes used to reject
		invalid ones.
		"""
		inetnum = {}
		for line in block:
			# Split line
			key, val = split_line(line)

			# Filter any inetnum records which are only referring to IP space
			# not managed by that specific RIR...
			if key == "netname":
				if re.match(r"^(ERX-NETBLOCK|(AFRINIC|ARIN|LACNIC|RIPE)-CIDR-BLOCK|IANA-NETBLOCK-\d{1,3}|NON-RIPE-NCC-MANAGED-ADDRESS-BLOCK|STUB-[\d-]{3,}SLASH\d{1,2})", val.strip()):
					log.debug("Skipping record indicating historic/orphaned data: %s" % val.strip())
					return

			if key == "inetnum":
				# inetnum lines are usually "first - last" ranges
				start_address, delim, end_address = val.partition("-")

				# Strip any excess space
				start_address, end_address = start_address.rstrip(), end_address.strip()

				# Handle "inetnum" formatting in LACNIC DB (e.g. "24.152.8/22" instead of "24.152.8.0/22")
				if start_address and not (delim or end_address):
					try:
						start_address = ipaddress.ip_network(start_address, strict=False)
					except ValueError:
						# Try padding the truncated network address with zero octets
						start_address = start_address.split("/")
						ldigits = start_address[0].count(".")

						# How many octets do we need to add?
						# (LACNIC does not seem to have a /8 or greater assigned, so the following should suffice.)
						if ldigits == 1:
							start_address = start_address[0] + ".0.0/" + start_address[1]
						elif ldigits == 2:
							start_address = start_address[0] + ".0/" + start_address[1]
						else:
							log.warning("Could not recover IPv4 address from line in LACNIC DB format: %s" % line)
							return

						try:
							start_address = ipaddress.ip_network(start_address, strict=False)
						except ValueError:
							log.warning("Could not parse line in LACNIC DB format: %s" % line)
							return

					# Enumerate first and last IP address of this network
					end_address = start_address[-1]
					start_address = start_address[0]

				else:
					# Convert to IP address
					try:
						start_address = ipaddress.ip_address(start_address)
						end_address = ipaddress.ip_address(end_address)
					except ValueError:
						log.warning("Could not parse line: %s" % line)
						return

				# Turn the arbitrary range into a minimal list of CIDR networks
				inetnum["inetnum"] = list(ipaddress.summarize_address_range(start_address, end_address))

			elif key == "inet6num":
				# IPv6 records already come in CIDR notation
				inetnum[key] = [ipaddress.ip_network(val, strict=False)]

			elif key == "country":
				cc = val.upper()

				# Ignore certain country codes
				if cc in IGNORED_COUNTRIES:
					log.debug("Ignoring country code '%s'" % cc)
					continue

				# Translate country codes
				try:
					cc = TRANSLATED_COUNTRIES[cc]
				except KeyError:
					pass

				# Do we know this country?
				if not cc in countries:
					log.warning("Skipping invalid country code '%s'" % cc)
					continue

				# Collect all (valid) country codes in order of appearance
				try:
					inetnum[key].append(cc)
				except KeyError:
					inetnum[key] = [cc]

			# Parse the geofeed attribute
			elif key == "geofeed":
				inetnum["geofeed"] = val

			# Parse geofeed when used as a remark
			elif key == "remarks":
				m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
				if m:
					inetnum["geofeed"] = m.group(1)

		# Skip empty objects
		if not inetnum:
			return

		# Iterate through all networks enumerated from above, check them for plausibility and insert
		# them into the database, if _check_parsed_network() succeeded
		for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
			if not self._check_parsed_network(single_network):
				continue

			# Fetch the countries or use a list with an empty country
			countries = inetnum.get("country", [None])

			# Insert the network into the database but only use the first country code
			for cc in countries:
				self.db.execute("""
					INSERT INTO
						_rirdata
					(
						network,
						country,
						original_countries,
						source
					)
					VALUES
					(
						%s, %s, %s, %s
					)
					ON CONFLICT (network)
						DO UPDATE SET country = excluded.country
					""", "%s" % single_network, cc, [cc for cc in countries if cc], source_key,
				)

				# If there are more than one country, we will only use the first one
				break

			# Update any geofeed information
			geofeed = inetnum.get("geofeed", None)
			if geofeed:
				self._parse_geofeed(source_key, geofeed, single_network)
39bfa052 1312
	def _parse_geofeed(self, source, url, single_network):
		"""
		Validates a Geofeed URL found in RIR data and stores/updates the
		(network, url) association in network_geofeeds.
		"""
		# Check the URL
		url = self._check_geofeed_url(url)
		if not url:
			return

		# Store/update any geofeeds
		self.db.execute("""
			INSERT INTO
				network_geofeeds
			(
				network,
				url,
				source
			)
			VALUES
			(
				%s, %s, %s
			)
			ON CONFLICT
			(
				network, url
			)
			DO UPDATE SET
				source = excluded.source
			""", "%s" % single_network, url, source,
		)
1340
28b08385 1341 def _parse_org_block(self, block, source_key):
6ffd06b5
MT
1342 org = {}
1343 for line in block:
1344 # Split line
1345 key, val = split_line(line)
1346
e7d612e5
PM
1347 if key == "organisation":
1348 org[key] = val.upper()
1349 elif key == "org-name":
6ffd06b5
MT
1350 org[key] = val
1351
1352 # Skip empty objects
1353 if not org:
1354 return
1355
28b08385
PM
1356 self.db.execute("INSERT INTO _organizations(handle, name, source) \
1357 VALUES(%s, %s, %s) ON CONFLICT (handle) DO \
0365119d 1358 UPDATE SET name = excluded.name",
28b08385 1359 org.get("organisation"), org.get("org-name"), source_key,
6ffd06b5
MT
1360 )
1361
4b931f6c 1362 def _parse_line(self, line, source_key, validcountries=None):
429a43d1
MT
1363 # Skip version line
1364 if line.startswith("2"):
1365 return
6ffd06b5 1366
429a43d1
MT
1367 # Skip comments
1368 if line.startswith("#"):
1369 return
6ffd06b5 1370
429a43d1
MT
1371 try:
1372 registry, country_code, type, line = line.split("|", 3)
1373 except:
1374 log.warning("Could not parse line: %s" % line)
1375 return
6ffd06b5 1376
9a9b0698
MT
1377 # Skip ASN
1378 if type == "asn":
1379 return
1380
4b931f6c 1381 # Skip any unknown protocols
9a9b0698 1382 elif not type in ("ipv6", "ipv4"):
4b931f6c
MT
1383 log.warning("Unknown IP protocol '%s'" % type)
1384 return
1385
84b175e2
PM
1386 # Skip any lines that are for stats only or do not have a country
1387 # code at all (avoids log spam below)
1388 if not country_code or country_code == '*':
1389 return
1390
1391 # Skip objects with unknown country codes
1392 if validcountries and country_code not in validcountries:
1393 log.warning("Skipping line with bogus country '%s': %s" % \
1394 (country_code, line))
6ffd06b5
MT
1395 return
1396
429a43d1
MT
1397 try:
1398 address, prefix, date, status, organization = line.split("|")
1399 except ValueError:
1400 organization = None
1401
1402 # Try parsing the line without organization
1403 try:
1404 address, prefix, date, status = line.split("|")
1405 except ValueError:
1406 log.warning("Unhandled line format: %s" % line)
1407 return
1408
1409 # Skip anything that isn't properly assigned
1410 if not status in ("assigned", "allocated"):
1411 return
1412
1413 # Cast prefix into an integer
1414 try:
1415 prefix = int(prefix)
1416 except:
1417 log.warning("Invalid prefix: %s" % prefix)
7177031f 1418 return
429a43d1
MT
1419
1420 # Fix prefix length for IPv4
1421 if type == "ipv4":
1422 prefix = 32 - int(math.log(prefix, 2))
1423
1424 # Try to parse the address
1425 try:
1426 network = ipaddress.ip_network("%s/%s" % (address, prefix), strict=False)
1427 except ValueError:
1428 log.warning("Invalid IP address: %s" % address)
1429 return
1430
bd341642
PM
1431 if not self._check_parsed_network(network):
1432 return
1433
4b931f6c
MT
1434 self.db.execute("""
1435 INSERT INTO
1436 networks
1437 (
1438 network,
1439 country,
1440 original_countries,
1441 source
1442 )
1443 VALUES
1444 (
1445 %s, %s, %s, %s
1446 )
1447 ON CONFLICT (network)
1448 DO UPDATE SET country = excluded.country
60f09cda 1449 """, "%s" % network, country_code, [country_code], source_key,
6ffd06b5
MT
1450 )
1451
	async def handle_update_announcements(self, ns):
		"""
		Imports the current BGP routing table and purges anything that must
		never end up in the announcements table (bogons, private space,
		too-specific routes, reserved ASNs, stale entries).

		ns.server[0] is the route server to talk to; a leading "/" means a
		local Bird control socket.
		"""
		server = ns.server[0]

		with self.db.transaction():
			# A path means a local Bird UNIX socket
			if server.startswith("/"):
				await self._handle_update_announcements_from_bird(server)

			# Purge anything we never want here
			self.db.execute("""
				-- Delete default routes
				DELETE FROM announcements WHERE network = '::/0' OR network = '0.0.0.0/0';

				-- Delete anything that is not global unicast address space
				DELETE FROM announcements WHERE family(network) = 6 AND NOT network <<= '2000::/3';

				-- DELETE "current network" address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '0.0.0.0/8';

				-- DELETE local loopback address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '127.0.0.0/8';

				-- DELETE RFC 1918 address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '10.0.0.0/8';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '172.16.0.0/12';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.168.0.0/16';

				-- DELETE test, benchmark and documentation address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.0.0/24';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.0.2.0/24';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.18.0.0/15';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '198.51.100.0/24';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '203.0.113.0/24';

				-- DELETE CGNAT address space (RFC 6598)
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '100.64.0.0/10';

				-- DELETE link local address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '169.254.0.0/16';

				-- DELETE IPv6 to IPv4 (6to4) address space (RFC 3068)
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '192.88.99.0/24';
				DELETE FROM announcements WHERE family(network) = 6 AND network <<= '2002::/16';

				-- DELETE multicast and reserved address space
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '224.0.0.0/4';
				DELETE FROM announcements WHERE family(network) = 4 AND network <<= '240.0.0.0/4';

				-- Delete networks that are too small to be in the global routing table
				DELETE FROM announcements WHERE family(network) = 6 AND masklen(network) > 48;
				DELETE FROM announcements WHERE family(network) = 4 AND masklen(network) > 24;

				-- Delete any non-public or reserved ASNs
				DELETE FROM announcements WHERE NOT (
					(autnum >= 1 AND autnum <= 23455)
				OR
					(autnum >= 23457 AND autnum <= 64495)
				OR
					(autnum >= 131072 AND autnum <= 4199999999)
				);

				-- Delete everything that we have not seen for 14 days
				DELETE FROM announcements WHERE last_seen_at <= CURRENT_TIMESTAMP - INTERVAL '14 days';
			""")
1515
98d9079b 1516 async def _handle_update_announcements_from_bird(self, server):
22d8d199 1517 # Pre-compile the regular expression for faster searching
35e5f041 1518 route = re.compile(b"^\s(.+?)\s+.+?\[(?:AS(.*?))?.\]$")
22d8d199
MT
1519
1520 log.info("Requesting routing table from Bird (%s)" % server)
1521
35e5f041
MT
1522 aggregated_networks = []
1523
22d8d199
MT
1524 # Send command to list all routes
1525 for line in self._bird_cmd(server, "show route"):
1526 m = route.match(line)
1527 if not m:
b767db8d
MT
1528 # Skip empty lines
1529 if not line:
1530 pass
1531
45fa97cd 1532 # Ignore any header lines with the name of the routing table
b767db8d
MT
1533 elif line.startswith(b"Table"):
1534 pass
45fa97cd
MT
1535
1536 # Log anything else
b767db8d
MT
1537 else:
1538 log.debug("Could not parse line: %s" % line.decode())
1539
22d8d199
MT
1540 continue
1541
1542 # Fetch the extracted network and ASN
1543 network, autnum = m.groups()
1544
35e5f041
MT
1545 # Decode into strings
1546 if network:
1547 network = network.decode()
1548 if autnum:
1549 autnum = autnum.decode()
1550
1551 # Collect all aggregated networks
1552 if not autnum:
1553 log.debug("%s is an aggregated network" % network)
1554 aggregated_networks.append(network)
1555 continue
1556
22d8d199
MT
1557 # Insert it into the database
1558 self.db.execute("INSERT INTO announcements(network, autnum) \
1559 VALUES(%s, %s) ON CONFLICT (network) DO \
1560 UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
35e5f041 1561 network, autnum,
22d8d199
MT
1562 )
1563
35e5f041
MT
1564 # Process any aggregated networks
1565 for network in aggregated_networks:
1566 log.debug("Processing aggregated network %s" % network)
1567
1568 # Run "show route all" for each network
1569 for line in self._bird_cmd(server, "show route %s all" % network):
1570 # Try finding the path
1571 m = re.match(b"\s+BGP\.as_path:.* (\d+) {\d+}$", line)
1572 if m:
1573 # Select the last AS number in the path
1574 autnum = m.group(1).decode()
1575
1576 # Insert it into the database
1577 self.db.execute("INSERT INTO announcements(network, autnum) \
1578 VALUES(%s, %s) ON CONFLICT (network) DO \
1579 UPDATE SET autnum = excluded.autnum, last_seen_at = CURRENT_TIMESTAMP",
1580 network, autnum,
1581 )
1582
1583 # We don't need to process any more
1584 break
1585
22d8d199
MT
1586 def _bird_cmd(self, socket_path, command):
1587 # Connect to the socket
1588 s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
1589 s.connect(socket_path)
cedee656 1590
22d8d199
MT
1591 # Allocate some buffer
1592 buffer = b""
83d61c46 1593
bbcefb38
MT
1594 log.debug("Sending Bird command: %s" % command)
1595
22d8d199
MT
1596 # Send the command
1597 s.send(b"%s\n" % command.encode())
209c04b6 1598
22d8d199
MT
1599 while True:
1600 # Fill up the buffer
1601 buffer += s.recv(4096)
209c04b6 1602
22d8d199
MT
1603 while True:
1604 # Search for the next newline
1605 pos = buffer.find(b"\n")
209c04b6 1606
22d8d199
MT
1607 # If we cannot find one, we go back and read more data
1608 if pos <= 0:
1609 break
209c04b6 1610
22d8d199
MT
1611 # Cut after the newline character
1612 pos += 1
b89cee80 1613
22d8d199
MT
1614 # Split the line we want and keep the rest in buffer
1615 line, buffer = buffer[:pos], buffer[pos:]
83d61c46 1616
bbcefb38
MT
1617 # Try parsing any status lines
1618 if len(line) > 4 and line[:4].isdigit() and line[4] in (32, 45):
1619 code, delim, line = int(line[:4]), line[4], line[5:]
1620
1621 log.debug("Received response code %s from bird" % code)
1622
1623 # End of output
1624 if code == 0:
1625 return
1626
1627 # Ignore hello line
1628 elif code == 1:
1629 continue
83d61c46 1630
22d8d199
MT
1631 # Otherwise return the line
1632 yield line
83d61c46 1633
	async def handle_update_geofeeds(self, ns):
		"""
		Synchronises the geofeeds table with all referenced Geofeed URLs,
		re-fetches any feed that has not been updated for a week, and drops
		data of feeds that have been failing for two weeks.
		"""
		# Sync geofeeds
		with self.db.transaction():
			# Delete all geofeeds which are no longer linked
			# (neither referenced by a network nor by an override)
			self.db.execute("""
				DELETE FROM
					geofeeds
				WHERE
					geofeeds.url NOT IN (
						SELECT
							network_geofeeds.url
						FROM
							network_geofeeds

						UNION

						SELECT
							geofeed_overrides.url
						FROM
							geofeed_overrides
					)
				""",
			)

			# Copy all geofeeds
			self.db.execute("""
				WITH all_geofeeds AS (
					SELECT
						network_geofeeds.url
					FROM
						network_geofeeds

					UNION

					SELECT
						geofeed_overrides.url
					FROM
						geofeed_overrides
				)
				INSERT INTO
					geofeeds
				(
					url
				)
				SELECT
					url
				FROM
					all_geofeeds
				ON CONFLICT (url)
				DO NOTHING
				""",
			)

		# Fetch all Geofeeds that require an update
		geofeeds = self.db.query("""
			SELECT
				id,
				url
			FROM
				geofeeds
			WHERE
				updated_at IS NULL
			OR
				updated_at <= CURRENT_TIMESTAMP - INTERVAL '1 week'
			ORDER BY
				id
			""")

		# Limit the number of concurrent fetches
		ratelimiter = asyncio.Semaphore(32)

		# Update all geofeeds
		async with asyncio.TaskGroup() as tasks:
			for geofeed in geofeeds:
				task = tasks.create_task(
					self._fetch_geofeed(ratelimiter, geofeed),
				)

		# Delete data from any feeds that did not update in the last two weeks
		with self.db.transaction():
			self.db.execute("""
				DELETE FROM
					geofeed_networks
				WHERE
					geofeed_networks.geofeed_id IN (
						SELECT
							geofeeds.id
						FROM
							geofeeds
						WHERE
							updated_at IS NULL
						OR
							updated_at <= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
					)
				""")
1728
98d9079b
MT
	async def _fetch_geofeed(self, ratelimiter, geofeed):
		"""
		Downloads one Geofeed (RFC 8805 CSV) and replaces its rows in
		geofeed_networks; records HTTP/connection failures in the geofeeds
		status/error columns.

		ratelimiter is a semaphore bounding concurrent downloads.
		"""
		async with ratelimiter:
			log.debug("Fetching Geofeed %s" % geofeed.url)

			with self.db.transaction():
				# Open the URL
				try:
					# Send the request
					# (retrieve blocks, so run it in a worker thread)
					f = await asyncio.to_thread(
						self.downloader.retrieve,

						# Fetch the feed by its URL
						geofeed.url,

						# Send some extra headers
						headers={
							"User-Agent" : "location/%s" % location.__version__,

							# We expect some plain text file in CSV format
							"Accept" : "text/csv, text/plain",
						},

						# Don't wait longer than 10 seconds for a response
						timeout=10,
					)

					# Remove any previous data
					self.db.execute("DELETE FROM geofeed_networks \
						WHERE geofeed_id = %s", geofeed.id)

					lineno = 0

					# Read the output line by line
					with self.db.pipeline():
						for line in f:
							lineno += 1

							try:
								line = line.decode()

							# Ignore any lines we cannot decode
							except UnicodeDecodeError:
								log.debug("Could not decode line %s in %s" \
									% (lineno, geofeed.url))
								continue

							# Strip any newline
							line = line.rstrip()

							# Skip empty lines
							if not line:
								continue

							# Skip comments
							elif line.startswith("#"):
								continue

							# Try to parse the line
							try:
								fields = line.split(",", 5)
							except ValueError:
								log.debug("Could not parse line: %s" % line)
								continue

							# Check if we have enough fields
							if len(fields) < 4:
								log.debug("Not enough fields in line: %s" % line)
								continue

							# Fetch all fields
							# (RFC 8805 order: prefix, country, region, city)
							network, country, region, city, = fields[:4]

							# Try to parse the network
							try:
								network = ipaddress.ip_network(network, strict=False)
							except ValueError:
								log.debug("Could not parse network: %s" % network)
								continue

							# Strip any excess whitespace from country codes
							country = country.strip()

							# Make the country code uppercase
							country = country.upper()

							# Check the country code
							if not country:
								log.debug("Empty country code in Geofeed %s line %s" \
									% (geofeed.url, lineno))
								continue

							elif not location.country_code_is_valid(country):
								log.debug("Invalid country code in Geofeed %s:%s: %s" \
									% (geofeed.url, lineno, country))
								continue

							# Write this into the database
							self.db.execute("""
								INSERT INTO
									geofeed_networks (
										geofeed_id,
										network,
										country,
										region,
										city
									)
								VALUES (%s, %s, %s, %s, %s)""",
								geofeed.id,
								"%s" % network,
								country,
								region,
								city,
							)

				# Catch any HTTP errors
				# NOTE(review): these classes live in urllib.error; access via
				# urllib.request works only if urllib.request is imported
				# somewhere in this file — confirm.
				except urllib.request.HTTPError as e:
					self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
						WHERE id = %s", e.code, "%s" % e, geofeed.id)

					# Remove any previous data when the feed has been deleted
					if e.code == 404:
						self.db.execute("DELETE FROM geofeed_networks \
							WHERE geofeed_id = %s", geofeed.id)

				# Catch any other errors and connection timeouts
				except (http.client.InvalidURL, urllib.request.URLError, TimeoutError) as e:
					log.debug("Could not fetch URL %s: %s" % (geofeed.url, e))

					# 599 marks a non-HTTP (network level) failure
					self.db.execute("UPDATE geofeeds SET status = %s, error = %s \
						WHERE id = %s", 599, "%s" % e, geofeed.id)

				# Mark the geofeed as updated
				else:
					self.db.execute("""
						UPDATE
							geofeeds
						SET
							updated_at = CURRENT_TIMESTAMP,
							status = NULL,
							error = NULL
						WHERE
							id = %s""",
						geofeed.id,
					)
0fcdd689 1873
	async def handle_update_overrides(self, ns):
		"""
		Re-imports all override files given on the command line.

		Previous override content is dropped first, then every block from
		every file is parsed and written into network_overrides,
		autnum_overrides, geofeeds or network_geofeeds.
		"""
		with self.db.transaction():
			# Drop any previous content
			self.db.execute("TRUNCATE TABLE autnum_overrides")
			self.db.execute("TRUNCATE TABLE network_overrides")

			# Remove all Geofeeds that came from overrides
			self.db.execute("DELETE FROM network_geofeeds WHERE source = %s", "overrides")

			for file in ns.files:
				log.info("Reading %s..." % file)

				with open(file, "rb") as f:
					for type, block in read_blocks(f):
						if type == "net":
							network = block.get("net")
							# Try to parse and normalise the network
							try:
								network = ipaddress.ip_network(network, strict=False)
							except ValueError as e:
								log.warning("Invalid IP network: %s: %s" % (network, e))
								continue

							# Prevent that we overwrite all networks
							if network.prefixlen == 0:
								log.warning("Skipping %s: You cannot overwrite default" % network)
								continue

							self.db.execute("""
								INSERT INTO
									network_overrides
								(
									network,
									country,
									is_anonymous_proxy,
									is_satellite_provider,
									is_anycast,
									is_drop
								)
								VALUES
								(
									%s, %s, %s, %s, %s, %s
								)
								ON CONFLICT (network) DO NOTHING
								""",
								"%s" % network,
								block.get("country"),
								self._parse_bool(block, "is-anonymous-proxy"),
								self._parse_bool(block, "is-satellite-provider"),
								self._parse_bool(block, "is-anycast"),
								self._parse_bool(block, "drop"),
							)

						elif type == "aut-num":
							autnum = block.get("aut-num")

							# Check if AS number begins with "AS"
							if not autnum.startswith("AS"):
								log.warning("Invalid AS number: %s" % autnum)
								continue

							# Strip "AS"
							autnum = autnum[2:]

							self.db.execute("""
								INSERT INTO
									autnum_overrides
								(
									number,
									name,
									country,
									is_anonymous_proxy,
									is_satellite_provider,
									is_anycast,
									is_drop
								)
								VALUES
								(
									%s, %s, %s, %s, %s, %s, %s
								)
								ON CONFLICT (number) DO NOTHING
								""",
								autnum,
								block.get("name"),
								block.get("country"),
								self._parse_bool(block, "is-anonymous-proxy"),
								self._parse_bool(block, "is-satellite-provider"),
								self._parse_bool(block, "is-anycast"),
								self._parse_bool(block, "drop"),
							)

						# Geofeeds
						elif type == "geofeed":
							networks = []

							# Fetch the URL
							url = block.get("geofeed")

							# Fetch permitted networks
							for n in block.get("network", []):
								try:
									n = ipaddress.ip_network(n)
								except ValueError as e:
									log.warning("Ignoring invalid network %s: %s" % (n, e))
									continue

								networks.append(n)

							# Log a warning if no networks have been permitted
							if not networks:
								log.warning("Geofeed %s is not permitted for any networks. Ignoring." % url)
								continue

							# Check the URL
							url = self._check_geofeed_url(url)
							if not url:
								continue

							# Store the Geofeed URL
							self.db.execute("""
								INSERT INTO
									geofeeds
								(
									url
								)
								VALUES
								(
									%s
								)
								ON CONFLICT (url) DO NOTHING
								""", url,
							)

							# Store all permitted networks
							self.db.executemany("""
								INSERT INTO
									network_geofeeds
								(
									network,
									url,
									source
								)
								VALUES
								(
									%s, %s, %s
								)
								ON CONFLICT
								(
									network, url
								)
								DO UPDATE SET
									source = excluded.source
								""", (("%s" % n, url, "overrides") for n in networks),
							)

						else:
							log.warning("Unsupported type: %s" % type)
d7fc3057 2031
	async def handle_update_feeds(self, ns):
		"""
		Update any third-party feeds.

		Each entry in the feed table maps a source name to the callback that
		parses it and the URL to fetch. Returns 0 when all requested feeds
		imported successfully, 1 otherwise.
		"""
		success = True

		feeds = (
			# AWS IP Ranges
			("AWS-IP-RANGES", self._import_aws_ip_ranges, "https://ip-ranges.amazonaws.com/ip-ranges.json"),

			# Spamhaus DROP
			("SPAMHAUS-DROP", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/drop.txt"),
			("SPAMHAUS-EDROP", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/edrop.txt"),
			("SPAMHAUS-DROPV6", self._import_spamhaus_drop, "https://www.spamhaus.org/drop/dropv6.txt"),

			# Spamhaus ASNDROP
			("SPAMHAUS-ASNDROP", self._import_spamhaus_asndrop, "https://www.spamhaus.org/drop/asndrop.json"),
		)

		# Drop any data from feeds that we don't support (any more)
		with self.db.transaction():
			# Fetch the names of all feeds we support
			sources = [name for name, *rest in feeds]

			self.db.execute("DELETE FROM autnum_feeds WHERE NOT source = ANY(%s)", sources)
			self.db.execute("DELETE FROM network_feeds WHERE NOT source = ANY(%s)", sources)

		# Walk through all feeds
		for name, callback, url, *args in feeds:
			# Skip any feeds that were not requested on the command line
			if ns.feeds and not name in ns.feeds:
				continue

			try:
				await self._process_feed(name, callback, url, *args)

			# Log an error but continue if an exception occurs
			except Exception as e:
				log.error("Error processing feed '%s': %s" % (name, e))
				success = False

		# Return status
		return 0 if success else 1
dcef2ba4 2075
98d9079b 2076 async def _process_feed(self, name, callback, url, *args):
a775372b
MT
2077 """
2078 Processes one feed
2079 """
2080 # Open the URL
335946c0 2081 f = self.downloader.retrieve(url)
85e44d9a 2082
a775372b
MT
2083 with self.db.transaction():
2084 # Drop any previous content
2085 self.db.execute("DELETE FROM autnum_feeds WHERE source = %s", name)
2086 self.db.execute("DELETE FROM network_feeds WHERE source = %s", name)
dcef2ba4 2087
a775372b 2088 # Call the callback to process the feed
c86209e2
MT
2089 with self.db.pipeline():
2090 return await callback(name, f, *args)
a775372b 2091
98d9079b 2092 async def _import_aws_ip_ranges(self, name, f):
a775372b 2093 # Parse the feed
c30e421e 2094 feed = json.load(f)
68fdeb3b 2095
99e4a4eb 2096 # Set up a dictionary for mapping a region name to a country. Unfortunately,
dcef2ba4
PM
2097 # there seems to be no machine-readable version available of this other than
2098 # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html
2099 # (worse, it seems to be incomplete :-/ ); https://www.cloudping.cloud/endpoints
2100 # was helpful here as well.
2101 aws_region_country_map = {
99e4a4eb
MT
2102 # Africa
2103 "af-south-1" : "ZA",
2104
2105 # Asia
2106 "il-central-1" : "IL", # Tel Aviv
2107
2108 # Asia/Pacific
2109 "ap-northeast-1" : "JP",
2110 "ap-northeast-2" : "KR",
2111 "ap-northeast-3" : "JP",
2112 "ap-east-1" : "HK",
2113 "ap-south-1" : "IN",
2114 "ap-south-2" : "IN",
2115 "ap-southeast-1" : "SG",
2116 "ap-southeast-2" : "AU",
2117 "ap-southeast-3" : "MY",
2118 "ap-southeast-4" : "AU",
2119 "ap-southeast-5" : "NZ", # Auckland, NZ
2120 "ap-southeast-6" : "AP", # XXX: Precise location not documented anywhere
2121
2122 # Canada
2123 "ca-central-1" : "CA",
2124 "ca-west-1" : "CA",
2125
2126 # Europe
2127 "eu-central-1" : "DE",
2128 "eu-central-2" : "CH",
2129 "eu-north-1" : "SE",
2130 "eu-west-1" : "IE",
2131 "eu-west-2" : "GB",
2132 "eu-west-3" : "FR",
2133 "eu-south-1" : "IT",
2134 "eu-south-2" : "ES",
2135
2136 # Middle East
2137 "me-central-1" : "AE",
2138 "me-south-1" : "BH",
2139
2140 # South America
2141 "sa-east-1" : "BR",
2142
2143 # Undocumented, likely located in Berlin rather than Frankfurt
2144 "eusc-de-east-1" : "DE",
2145 }
dcef2ba4 2146
c30e421e
MT
2147 # Collect a list of all networks
2148 prefixes = feed.get("ipv6_prefixes", []) + feed.get("prefixes", [])
dcef2ba4 2149
c30e421e
MT
2150 for prefix in prefixes:
2151 # Fetch network
2152 network = prefix.get("ipv6_prefix") or prefix.get("ip_prefix")
2153
2154 # Parse the network
a775372b 2155 try:
c30e421e
MT
2156 network = ipaddress.ip_network(network)
2157 except ValuleError as e:
2158 log.warning("%s: Unable to parse prefix %s" % (name, network))
a775372b 2159 continue
dcef2ba4 2160
a775372b
MT
2161 # Sanitize parsed networks...
2162 if not self._check_parsed_network(network):
2163 continue
dcef2ba4 2164
c30e421e
MT
2165 # Fetch the region
2166 region = prefix.get("region")
2167
2168 # Set some defaults
a775372b
MT
2169 cc = None
2170 is_anycast = False
2171
c30e421e
MT
2172 # Fetch the CC from the dictionary
2173 try:
a775372b 2174 cc = aws_region_country_map[region]
dcef2ba4 2175
c30e421e
MT
2176 # If we couldn't find anything, let's try something else...
2177 except KeyError as e:
2178 # Find anycast networks
2179 if region == "GLOBAL":
2180 is_anycast = True
2181
2182 # Everything that starts with us- is probably in the United States
2183 elif region.startswith("us-"):
2184 cc = "US"
2185
2186 # Everything that starts with cn- is probably China
2187 elif region.startswith("cn-"):
2188 cc = "CN"
2189
2190 # Log a warning for anything else
2191 else:
2192 log.warning("%s: Could not determine country code for AWS region %s" \
2193 % (name, region))
2194 continue
dcef2ba4 2195
c30e421e 2196 # Write to database
a775372b
MT
2197 self.db.execute("""
2198 INSERT INTO
2199 network_feeds
2200 (
2201 network,
2202 source,
2203 country,
2204 is_anycast
dcef2ba4 2205 )
a775372b
MT
2206 VALUES
2207 (
2208 %s, %s, %s, %s
2209 )
2210 ON CONFLICT (network, source) DO NOTHING
c30e421e 2211 """, "%s" % network, name, cc, is_anycast,
a775372b 2212 )
dcef2ba4 2213
98d9079b 2214 async def _import_spamhaus_drop(self, name, f):
377cead4
MT
2215 """
2216 Import Spamhaus DROP IP feeds
2217 """
2218 # Count all lines
2219 lines = 0
69b3d894 2220
377cead4
MT
2221 # Walk through all lines
2222 for line in f:
2223 # Decode line
2224 line = line.decode("utf-8")
69b3d894 2225
377cead4
MT
2226 # Strip off any comments
2227 line, _, comment = line.partition(";")
69b3d894 2228
377cead4
MT
2229 # Ignore empty lines
2230 if not line:
2231 continue
85e44d9a 2232
377cead4
MT
2233 # Strip any excess whitespace
2234 line = line.strip()
69b3d894 2235
377cead4
MT
2236 # Increment line counter
2237 lines += 1
5acac2a4 2238
377cead4
MT
2239 # Parse the network
2240 try:
2241 network = ipaddress.ip_network(line)
2242 except ValueError as e:
2243 log.warning("%s: Could not parse network: %s - %s" % (name, line, e))
2244 continue
69b3d894 2245
377cead4
MT
2246 # Check network
2247 if not self._check_parsed_network(network):
2248 log.warning("%s: Skipping bogus network: %s" % (name, network))
2249 continue
69b3d894 2250
377cead4
MT
2251 # Insert into the database
2252 self.db.execute("""
2253 INSERT INTO
2254 network_feeds
2255 (
2256 network,
2257 source,
2258 is_drop
2259 )
2260 VALUES
2261 (
2262 %s, %s, %s
2263 )""", "%s" % network, name, True,
2264 )
69b3d894 2265
377cead4
MT
2266 # Raise an exception if we could not import anything
2267 if not lines:
2268 raise RuntimeError("Received bogus feed %s with no data" % name)
69b3d894 2269
98d9079b 2270 async def _import_spamhaus_asndrop(self, name, f):
5dcda5c8
MT
2271 """
2272 Import Spamhaus ASNDROP feed
2273 """
2274 for line in f:
2275 # Decode the line
2276 line = line.decode("utf-8")
5acac2a4 2277
5dcda5c8
MT
2278 # Parse JSON
2279 try:
2280 line = json.loads(line)
2281 except json.JSONDecodeError as e:
2282 log.warning("%s: Unable to parse JSON object %s: %s" % (name, line, e))
2283 continue
69b3d894 2284
5dcda5c8
MT
2285 # Fetch type
2286 type = line.get("type")
69b3d894 2287
5dcda5c8
MT
2288 # Skip any metadata
2289 if type == "metadata":
2290 continue
69b3d894 2291
5dcda5c8
MT
2292 # Fetch ASN
2293 asn = line.get("asn")
69b3d894 2294
5dcda5c8
MT
2295 # Skip any lines without an ASN
2296 if not asn:
2297 continue
69b3d894 2298
5dcda5c8
MT
2299 # Filter invalid ASNs
2300 if not self._check_parsed_asn(asn):
2301 log.warning("%s: Skipping bogus ASN %s" % (name, asn))
2302 continue
69b3d894 2303
5dcda5c8
MT
2304 # Write to database
2305 self.db.execute("""
2306 INSERT INTO
2307 autnum_feeds
2308 (
2309 number,
2310 source,
2311 is_drop
2312 )
2313 VALUES
2314 (
2315 %s, %s, %s
2316 )""", "%s" % asn, name, True,
2317 )
69b3d894 2318
28d29b7c
MT
2319 @staticmethod
2320 def _parse_bool(block, key):
2321 val = block.get(key)
2322
2323 # There is no point to proceed when we got None
2324 if val is None:
2325 return
2326
2327 # Convert to lowercase
2328 val = val.lower()
2329
2330 # True
2331 if val in ("yes", "1"):
2332 return True
2333
2334 # False
2335 if val in ("no", "0"):
2336 return False
2337
2338 # Default to None
2339 return None
2340
98d9079b 2341 async def handle_import_countries(self, ns):
8084b33a
MT
2342 with self.db.transaction():
2343 # Drop all data that we have
2344 self.db.execute("TRUNCATE TABLE countries")
2345
2346 for file in ns.file:
2347 for line in file:
2348 line = line.rstrip()
2349
2350 # Ignore any comments
2351 if line.startswith("#"):
2352 continue
2353
2354 try:
2355 country_code, continent_code, name = line.split(maxsplit=2)
2356 except:
2357 log.warning("Could not parse line: %s" % line)
2358 continue
2359
2360 self.db.execute("INSERT INTO countries(country_code, name, continent_code) \
2361 VALUES(%s, %s, %s) ON CONFLICT DO NOTHING", country_code, name, continent_code)
2362
6ffd06b5
MT
2363
def split_line(line):
	"""
	Splits a "key: value" line and returns the key and value,
	each stripped of surrounding whitespace
	"""
	key, sep, value = line.partition(":")

	return key.strip(), value.strip()
8c3529c7
MT
def read_blocks(f):
	"""
	Parses WHOIS-style blocks of "key: value" lines and yields
	(type, data) tuples

	The key of the first line of a block determines its type. Values are
	stored as strings, except for "network" lines of geofeed blocks which
	are collected into a list.
	"""
	for raw_block in iterate_over_blocks(f):
		block_type = None
		data = {}

		for index, line in enumerate(raw_block):
			key, value = line.split(":", 1)

			# The key of the first line defines the type
			if not index:
				block_type = key

			# Strip any excess whitespace
			value = value.strip()

			# Geofeed networks are collected into a list...
			if block_type == "geofeed" and key == "network":
				data.setdefault(key, []).append(value)

			# ...everything else is stored as a plain string
			else:
				data[key] = value

		yield block_type, data
def iterate_over_blocks(f, charsets=("utf-8", "latin1")):
	"""
	Reads raw lines from f and yields them grouped into blocks

	Comment lines (starting with '#' or '%') are dropped, trailing
	comments are stripped, and an empty line terminates the current
	block. Each line is decoded with the first charset that succeeds.
	"""
	collected = []

	for raw in f:
		# Drop lines that are entirely comments
		if raw.startswith(b"#") or raw.startswith(b"%"):
			continue

		# Decode with the first charset that works (latin1 never fails)
		line = raw
		for charset in charsets:
			try:
				line = raw.decode(charset)
			except UnicodeDecodeError:
				continue
			else:
				break

		# Cut off any trailing comment, then trailing whitespace
		line, marker, trailing = line.partition("#")
		line = line.rstrip()

		# A line that carried only a comment is not a block separator
		if trailing and not line:
			continue

		# Keep collecting lines that have content
		if line:
			collected.append(line)
			continue

		# An empty line finishes the current block
		if collected:
			yield collected

			collected = []

	# Yield whatever is left at the end of the input
	if collected:
		yield collected
def iterate_over_lines(f):
	"""
	Decodes each raw line from f and yields it without the trailing
	whitespace/newline
	"""
	for raw in f:
		yield raw.decode().rstrip()
async def main():
	"""
	Builds the command line interface and runs it
	"""
	cli = CLI()

	# Execute whatever command was requested
	await cli.run()
# Only start the event loop when executed as a script, so the module can
# be imported (e.g. for testing) without running the importer
if __name__ == "__main__":
	asyncio.run(main())