git.ipfire.org Git - dbl.git/commitdiff
checker: Check if domains exist by querying their SOA
author Michael Tremer <michael.tremer@ipfire.org>
Sun, 28 Dec 2025 13:10:51 +0000 (13:10 +0000)
committer Michael Tremer <michael.tremer@ipfire.org>
Sun, 28 Dec 2025 13:10:51 +0000 (13:10 +0000)
This way, we should be able to massively reduce the blacklists because
so many domains actually no longer exist.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
Makefile.am
src/database.sql
src/dnsbl/checker.py [new file with mode: 0644]
src/dnsbl/database.py
src/scripts/dnsbl.in

index d2e0c309a907b0b7be0c897c804b6b37e0fc96f8..5bfca6a70209f81db0b556fa632e869665c2cb3a 100644 (file)
@@ -51,6 +51,7 @@ SED_PROCESS = \
 
 dist_pkgpython_PYTHON = \
        src/dnsbl/__init__.py \
+       src/dnsbl/checker.py \
        src/dnsbl/database.py \
        src/dnsbl/exporters.py \
        src/dnsbl/i18n.py \
index 489861599f334414493cf1ce86031c15cae98156..3faa2f9518eba55c0192c50b00d90813d3b3b39c 100644 (file)
@@ -2,7 +2,7 @@
 -- PostgreSQL database dump
 --
 
-\restrict 6nJVr9P5tK3JEHqsf1GqOxn2ArIH0TeeAQGlXEqO2KxOZaX0g6qxzMEPFvsyQVJ
+\restrict Xf18Fi8Ow9mJBKm7A0YHydhg05f2dQMA1HMDnTkGw1D1d51sv6ya9FmQD2QjJyr
 
 -- Dumped from database version 17.6 (Debian 17.6-0+deb13u1)
 -- Dumped by pg_dump version 17.6 (Debian 17.6-0+deb13u1)
@@ -23,6 +23,17 @@ SET default_tablespace = '';
 
 SET default_table_access_method = heap;
 
+--
+-- Name: checker_domains; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.checker_domains (
+    -- The domain name that has been checked
+    name text NOT NULL,
+    -- Time of the check; defaults to CURRENT_TIMESTAMP if the insert omits it
+    checked_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
+    -- Outcome of the check; the checker stores TRUE if the SOA query returned data
+    status boolean NOT NULL
+);
+
+
 --
 -- Name: lists; Type: TABLE; Schema: public; Owner: -
 --
@@ -194,6 +205,14 @@ ALTER TABLE ONLY public.source_domains ALTER COLUMN id SET DEFAULT nextval('publ
 ALTER TABLE ONLY public.sources ALTER COLUMN id SET DEFAULT nextval('public.sources_id_seq'::regclass);
 
 
+--
+-- Name: checker_domains checker_domains_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+-- Each domain name is stored at most once; the checker's upsert conflicts on this key
+ALTER TABLE ONLY public.checker_domains
+    ADD CONSTRAINT checker_domains_pkey PRIMARY KEY (name);
+
+
 --
 -- Name: lists lists_pkey; Type: CONSTRAINT; Schema: public; Owner: -
 --
@@ -274,5 +293,5 @@ ALTER TABLE ONLY public.sources
 -- PostgreSQL database dump complete
 --
 
-\unrestrict 6nJVr9P5tK3JEHqsf1GqOxn2ArIH0TeeAQGlXEqO2KxOZaX0g6qxzMEPFvsyQVJ
+\unrestrict Xf18Fi8Ow9mJBKm7A0YHydhg05f2dQMA1HMDnTkGw1D1d51sv6ya9FmQD2QjJyr
 
diff --git a/src/dnsbl/checker.py b/src/dnsbl/checker.py
new file mode 100644 (file)
index 0000000..9af6dc0
--- /dev/null
@@ -0,0 +1,213 @@
+###############################################################################
+#                                                                             #
+# dnsbl - A DNS Blocklist Compositor For IPFire                               #
+# Copyright (C) 2025 IPFire Development Team                                  #
+#                                                                             #
+# This program is free software: you can redistribute it and/or modify        #
+# it under the terms of the GNU General Public License as published by        #
+# the Free Software Foundation, either version 3 of the License, or           #
+# (at your option) any later version.                                         #
+#                                                                             #
+# This program is distributed in the hope that it will be useful,             #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of              #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
+# GNU General Public License for more details.                                #
+#                                                                             #
+# You should have received a copy of the GNU General Public License           #
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
+#                                                                             #
+###############################################################################
+
+import concurrent.futures
+import datetime
+import dns.rdatatype
+import dns.resolver
+import logging
+import sqlalchemy.dialects.postgresql
+import sqlmodel
+
+from . import database
+from . import sources
+
+# Setup logging
+log = logging.getLogger(__name__)
+
+class Checker(object):
+       """
+               The checker checks if a domain is still alive, i.e. resolves.
+       """
+       def __init__(self, backend):
+               self.backend = backend
+
+               # Initialize the executor
+               self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=64)
+
+               # Initialize the resolver
+               self.resolver = dns.resolver.Resolver()
+
+               self.results = {}
+
+       def check(self, batch_size=1024):
+               """
+                       Checks all domains that need checking.
+               """
+               threshold = 64
+
+               with self.executor:
+                       while True:
+                               # Submit some more tasks whenever we run low
+                               if len(self.results) < threshold:
+                                       for domain in self.get_domains(batch_size):
+                                               self.submit(domain)
+
+                                       # Manually commit after a batch has been processed
+                                       self.backend.db.commit()
+
+                               # Terminate if we have no domains left to check
+                               if not self.results:
+                                       break
+
+                               try:
+                                       for result in concurrent.futures.as_completed(self.results, timeout=1):
+                                               self._store(result)
+
+                               # If nothing has completed, we just start a new iteration
+                               except TimeoutError:
+                                       pass
+
+       def get_domains(self, limit=None):
+               """
+                       Returns all domains that need checking
+               """
+               cutoff = datetime.datetime.now() - datetime.timedelta(weeks=4)
+
+               stmt = (
+                       sqlmodel
+                       .select(
+                               sources.SourceDomain.name,
+                       )
+                       .join(
+                               CheckerDomain,
+                               sources.SourceDomain.name == CheckerDomain.name,
+                               isouter=True,
+                       )
+                       .where(
+                               sources.SourceDomain.removed_at == None,
+
+                               # Only return domains that have not been checked or where the last check
+                               # was at least 4 weeks ago
+                               sqlmodel.or_(
+                                       CheckerDomain.checked_at == None,
+                                       CheckerDomain.checked_at <= cutoff,
+                               ),
+                       )
+                       .order_by(
+                               sqlmodel.nullsfirst(CheckerDomain.checked_at),
+                               sources.SourceDomain.name,
+                       )
+               )
+
+               # Apply the limit (if any)
+               if limit:
+                       stmt = stmt.limit(limit)
+
+               return self.backend.db.fetch(stmt)
+
+       def submit(self, domain, hostname=None):
+               """
+                       Submits a new job to the queue
+               """
+               if hostname is None:
+                       hostname = domain
+
+               result = self.executor.submit(self.resolve, hostname)
+
+               self.results[result] = domain
+
+       def resolve(self, domain):
+               log.debug("Resolving %s..." % domain)
+
+               # Fetch the result
+               return self.resolver.resolve(domain, "SOA", search=False, lifetime=60)
+
+       def _store(self, result):
+               """
+                       Called after we have received a result for the queried domain
+               """
+               # Fetch the domain name
+               domain = self.results.pop(result)
+
+               # Fetch the result or raise any exceptions
+               try:
+                       result = result.result()
+
+               # The response did not contain an answer to our question
+               except dns.resolver.NoAnswer as e:
+                       response = e.response()
+
+                       # If we have received a CNAME, we will resolve again
+                       if response:
+                               for rrset in response.answer:
+                                       if rrset.rdtype == dns.rdatatype.CNAME:
+                                               for record in rrset:
+                                                       hostname = record.target.to_text(omit_final_dot=True)
+
+                                                       return self.submit(domain, hostname=hostname)
+
+                       # If there has been no response, we assume that the domain does not exist
+                       status = False
+
+               # NXDOMAIN
+               except dns.resolver.NXDOMAIN as e:
+                       status = False
+
+               # SERVFAIL
+               except dns.resolver.NoNameservers as e:
+                       status = False
+
+               # Raise any other exception
+               except Exception as e:
+                       raise e
+
+               # There has been no exception, the query returned some data
+               else:
+                       status = True
+
+               log.debug("Storing result for %s..." % domain)
+
+               stmt = (
+                       sqlalchemy.dialects.postgresql
+                       .insert(
+                               CheckerDomain,
+                       )
+                       .values({
+                               "name"   : domain,
+                               "status" : status,
+                       })
+                       .on_conflict_do_update(
+                               index_elements = [
+                                       CheckerDomain.name,
+                               ],
+                               set_ = {
+                                       "checked_at" : sqlmodel.func.current_timestamp(),
+                               }
+                       )
+               )
+
+               # Store the result
+               self.backend.db.execute(stmt)
+
+
+class CheckerDomain(sqlmodel.SQLModel, database.BackendMixin, table=True):
+       __tablename__ = "checker_domains"
+
+       # Name
+       name: str = sqlmodel.Field(primary_key=True)
+
+       # Checked At
+       checked_at : datetime.datetime = sqlmodel.Field(
+               sa_column_kwargs = {"server_default" : sqlmodel.text("CURRENT_TIMESTAMP")}
+       )
+
+       # Status
+       status : bool
index f4710e582f8fe27b9d396f919492b7cae4bfa601..0eaff3befc3ef3873054db57af68493aee747d4d 100644 (file)
@@ -164,6 +164,16 @@ class Database(object):
                # Return as set
                return set([o for o in objects])
 
+       def commit(self):
+               """
+                       Manually triggers a database commit
+               """
+               # Fetch our session
+               session = self.session()
+
+               # Commit!
+               session.commit()
+
 
 class BackendMixin:
        @functools.cached_property
index 05ff8007d336356d848bae00767f4296569dd137..db70b130e296f2c893731dae4d7a55bd4ba9e4d0 100644 (file)
@@ -22,6 +22,7 @@
 import argparse
 import babel.numbers
 import dnsbl
+import dnsbl.checker
 import dnsbl.exporters
 import logging
 import os
@@ -151,6 +152,11 @@ class CLI(object):
                analyze.add_argument("list", help=_("The name of the list"))
                analyze.set_defaults(func=self.__analyze)
 
+               # check-domains
+               check_domains = subparsers.add_parser("check-domains",
+                               help=_("Checks if domains are alive"))
+               check_domains.set_defaults(func=self.__check_domains)
+
                # Parse all arguments
                args = parser.parse_args()
 
@@ -472,6 +478,13 @@ class CLI(object):
                # Print the table
                self.console.print(table)
 
+       def __check_domains(self, backend, args):
+               """
+                       Runs the checker over all domains
+               """
+               checker = dnsbl.checker.Checker(backend)
+               checker.check()
+
 
 def main():
        c = CLI()