]> git.ipfire.org Git - dbl.git/commitdiff
lists: Optimize the lists
authorMichael Tremer <michael.tremer@ipfire.org>
Fri, 9 Jan 2026 10:47:47 +0000 (10:47 +0000)
committerMichael Tremer <michael.tremer@ipfire.org>
Fri, 9 Jan 2026 10:47:47 +0000 (10:47 +0000)
This is a quick solution to find any listed subdomains. Those don't have
to be exported if we already have the parent domain listed. This will
decrease the size of the lists.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/database.sql
src/dnsbl/checker.py
src/dnsbl/domains.py
src/dnsbl/lists.py
src/scripts/dnsbl.in

index 82b5501c58557e7e9b482fec8003fcfcc83b6b70..9b9dabbef76202a47aa003a5d554cc3dea6bb613 100644 (file)
@@ -2,7 +2,7 @@
 -- PostgreSQL database dump
 --
 
-\restrict 8o0t5OTJIfaTvsY8tJmP42PzP48zYSIzdnWkVDzdbzece2fzwS3EaVGiSKAGHZF
+\restrict zZebqk0H6Bn3b2lu7FY4z6lPZCN2XBuuNpQTisoH3Wz8jUXzJwWewQlrEzRq9r7
 
 -- Dumped from database version 17.6 (Debian 17.6-0+deb13u1)
 -- Dumped by pg_dump version 17.6 (Debian 17.6-0+deb13u1)
@@ -76,7 +76,8 @@ CREATE TABLE public.domains (
     report_add_id uuid,
     report_remove_id uuid,
     checked_at timestamp with time zone,
-    dead boolean DEFAULT false
+    dead boolean DEFAULT false,
+    subsumed boolean DEFAULT false NOT NULL
 );
 
 
@@ -638,5 +639,5 @@ ALTER TABLE ONLY public.sources
 -- PostgreSQL database dump complete
 --
 
-\unrestrict 8o0t5OTJIfaTvsY8tJmP42PzP48zYSIzdnWkVDzdbzece2fzwS3EaVGiSKAGHZF
+\unrestrict zZebqk0H6Bn3b2lu7FY4z6lPZCN2XBuuNpQTisoH3Wz8jUXzJwWewQlrEzRq9r7
 
index 1e6baee271905f159374cc5bdd4d09e1c0bbecec..1bd1aaa9c60028568a7b0b1da97d80d76640946a 100644 (file)
@@ -139,6 +139,10 @@ class Checker(object):
        def resolve(self, domain):
                log.debug("Resolving %s..." % domain)
 
+               # We want to check if the domain still exists and for that querying
+               # the top domain is enough.
+               domain = self.backend.psl.get_sld(domain)
+
                try:
                        result = self.resolver.resolve(domain, "SOA", search=False, lifetime=60)
 
index 085b6790f0b558374cd51d006721d76f028904a5..05a45002c2b28131bf080d81f9f647fc345aa494 100644 (file)
@@ -140,6 +140,9 @@ class Domain(sqlmodel.SQLModel, database.BackendMixin, table=True):
        # Dead?
        dead: bool = False
 
+       # Subsumed?
+       subsumed: bool = False
+
 
 class DomainEvent(sqlmodel.SQLModel, table=True):
        """
index 83c625b1f41de2ce0144976febabf36e186e0699..f25924b94e9da5dbbccef264f495c41c037d7fb5 100644 (file)
@@ -264,6 +264,9 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                                # Ignore domains that have been removed
                                domains.Domain.removed_at == None,
 
+                               # Ignore any domains that are subsumed by another domain
+                               domains.Domain.subsumed == False,
+
                                # Only select domains that are not dead
                                # or have not been checked, yet.
                                sqlmodel.or_(
@@ -461,6 +464,9 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                        if updated:
                                self.updated_at = sqlmodel.func.current_timestamp()
 
+                       # Optimize the list
+                       self.optimize(update_stats=False)
+
                        # Update the stats
                        self.update_stats()
 
@@ -744,6 +750,81 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
 
                return self.backend.db.fetch(stmt)
 
+       def optimize(self, update_stats=True):
+               """
+                       Optimizes this list
+               """
+               log.info("Optimizing %s..." % self)
+
+               # Fetch all domains on this list
+               names = self.backend.db.fetch_as_set(
+                       sqlmodel
+                       .select(
+                               domains.Domain.name
+                       )
+                       .distinct()
+                       .where(
+                               domains.Domain.list == self,
+                               domains.Domain.removed_at == None,
+                       )
+               )
+
+               # Collect all names that are redundant
+               redundant_names = set()
+
+               # Walk through all domains
+               for name in names:
+                       parent = name
+
+                       # Check if any parent domain is also listed
+                       while "." in parent:
+                               *garbage, parent = parent.partition(".")
+
+                               # If the domain is already listed, we ignore it
+                               if parent in names:
+                                       redundant_names.add(name)
+                                       break
+
+               log.info(_("Identified %s redunduant domain(s)") % len(redundant_names))
+
+               # Reset the status for all domains
+               self.backend.db.execute(
+                       sqlmodel
+                       .update(
+                               domains.Domain,
+                       )
+                       .values(
+                               subsumed = False,
+                       )
+                       .where(
+                               domains.Domain.list == self,
+                               domains.Domain.removed_at == None,
+                               domains.Domain.subsumed == True,
+                       )
+               )
+
+               # De-list the redundant domains
+               self.backend.db.execute(
+                       sqlmodel
+                       .update(
+                               domains.Domain,
+                       )
+                       .values(
+                               subsumed = True,
+                       )
+                       .where(
+                               domains.Domain.list == self,
+                               domains.Domain.removed_at == None,
+                               domains.Domain.name.in_(
+                                       redundant_names,
+                               )
+                       )
+               )
+
+               # Update all stats afterwards
+               if update_stats:
+                       self.update_stats()
+
 
 class ListStats(sqlmodel.SQLModel, table=True):
        __tablename__ = "list_stats"
index 89c46708091ad3e636417701944f5900e7f2f628..2d88038df427069d45426475dbd4389a02a85241 100644 (file)
@@ -25,6 +25,7 @@ import babel.numbers
 import dnsbl
 import dnsbl.checker
 import dnsbl.exporters
+import dnsbl.util
 import logging
 import os
 import rich.console
@@ -156,6 +157,11 @@ class CLI(object):
                analyze.add_argument("list", help=_("The name of the list"))
                analyze.set_defaults(func=self.__analyze)
 
+               # optimize
+               optimize = subparsers.add_parser("optimize", help=_("Optimize a list"))
+               optimize.add_argument("list", help=_("The name of the list"))
+               optimize.set_defaults(func=self.__optimize)
+
                # history
                history = subparsers.add_parser("history",
                                help=_("Shows the latest changes of a list"))
@@ -507,6 +513,16 @@ class CLI(object):
                # Print the table
                self.console.print(table)
 
+       def __optimize(self, backend, args):
+               """
+                       Optimizes a list
+               """
+               # Fetch the list
+               list = self.__get_list(backend, args.list)
+
+               with dnsbl.util.Stopwatch(_("Optimizing %s") % list):
+                       list.optimize()
+
        def __history(self, backend, args):
                """
                        Shows the history of a list