From: Michael Tremer Date: Fri, 2 Jan 2026 14:11:46 +0000 (+0000) Subject: sources: Rework the hosts file parser X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a286913c45fc70e58662e2a286c27cdcbdd6cbf1;p=dbl.git sources: Rework the hosts file parser Some sources separate the IP address and the hostname with tabs rather than a single space. Since we don't want to maintain a large list of IP address prefixes, we now simply check whether the line consists of an IP address followed by something else. Signed-off-by: Michael Tremer --- diff --git a/src/dnsbl/sources.py b/src/dnsbl/sources.py index 5705daf..b58c311 100644 --- a/src/dnsbl/sources.py +++ b/src/dnsbl/sources.py @@ -24,6 +24,7 @@ import enum import gzip import httpx import idna +import ipaddress import io import itertools import logging @@ -39,12 +40,6 @@ from .i18n import _ # Setup logging log = logging.getLogger(__name__) -HOST_PREFIXES = set(( - "0.0.0.0 ", - "127.0.0.1 ", - "::1 ", -)) - IGNORED_DOMAINS = set(( "localhost", )) @@ -400,7 +395,7 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): return Format.ADBLOCKPLUS # Is this a hosts file? - elif any(line.startswith(prefix) for prefix in HOST_PREFIXES): + elif self._detect_format_hosts(line): return Format.HOSTS # Check for a plain FQDN @@ -410,6 +405,19 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): # The format is (still?) unknown return None + def _detect_format_hosts(self, line): + """ + Checks if the line is in hosts format + """ + domain = self._process_hosts(line) + + # If we could parse the domain, this looks like the hosts format + if domain: + return True + + # We could not parse anything + return False + def _process_adblockplus(self, line): """ Parse the domain from the AdBlockPlus format @@ -427,12 +435,23 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): """ Parses a line of a hosts file. 
""" - for prefix in HOST_PREFIXES: - if line.startswith(prefix): - return line.removeprefix(prefix) + try: + address, domain = line.split() - # If none of the prefixes matched, we return the entire line - return line + # If we could not split the line by two tokens, it is not a hosts file + except ValueError as e: + return + + # Check if we can parse the IP address + try: + ipaddress.ip_address(address) + + # If the line is not prefixed with an IP address, we cannot continue + except ValueError as e: + return + + # We have a valid IP address and therefore can return the domain + return domain def _process_plain(self, line): """