]> git.ipfire.org Git - dbl.git/commitdiff
lists: Mark domains as listed for faster search
authorMichael Tremer <michael.tremer@ipfire.org>
Fri, 27 Feb 2026 10:50:53 +0000 (10:50 +0000)
committerMichael Tremer <michael.tremer@ipfire.org>
Fri, 27 Feb 2026 10:50:53 +0000 (10:50 +0000)
The query that is determining which domains are whitelisted has always
been very slow and there is no feasible way to accelerate it using
indexes, etc.

Therefore we will download all whitelisted domains and all potentially
blockable domains and perform the check in the Python application. That
way, we can later mark any delisted domains and fetch the entire list of
domains reasonably fast.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/dbl/lists.py
src/dbl/util.py

index 0cc43b447b830f3f11e0e8cfb00f356f1d437fee..70e03a6d86e121fa1834899cb3bd8fcf8ea4b09e 100644 (file)
@@ -272,109 +272,43 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
 
                return canary
 
-       @functools.cached_property
-       def __domains(self):
+       async def get_domains(self):
                """
-                       A CTE to access all (active) domains on this list
+                       Returns all domains that are on this list
                """
-               # Fetch all domains that should be blocked
-               blocked_domains = (
+               names = self.backend.db.fetch(
                        sqlmodel
                        .select(
-                               domains.Domain,
-                       )
-                       .where(
-                               # Select only domains from this list
-                               domains.Domain.list == self,
-
-                               # Only select domains that should be blocked
-                               domains.Domain.block == True,
-
-                               # Ignore domains that have been removed
-                               domains.Domain.removed_at == None,
-
-                               # Ignore any domains that are subsumed by another domain
-                               domains.Domain.subsumed == False,
-
-                               # Only select domains that are not dead
-                               # or have not been checked, yet.
-                               sqlmodel.or_(
-                                       domains.Domain.dead == None,
-                                       domains.Domain.dead == False,
-                               ),
-                       )
-                       .cte("blocked_domains")
-               )
-
-               # Fetch all whitelisted domains
-               whitelisted_domains = (
-                       sqlmodel
-                       .select(
-                               domains.Domain,
+                               domains.Domain.name,
                        )
+                       .distinct()
                        .where(
-                               # Select only domains from this list
+                               # Only fetch domains from this list
                                domains.Domain.list == self,
 
-                               # Only select domains that should not be blocked
-                               domains.Domain.block == False,
-
-                               # Ignore domains that have been removed
+                               # Ignore domains that have been removed
                                domains.Domain.removed_at == None,
-                       )
-                       .cte("whitelisted_domains")
-               )
 
-               # Remove any whitelisted and subdomains of any whitelisted domains
-               # from the list of blocked domains
-               listed_domains = (
-                       sqlmodel
-                       .select(
-                               blocked_domains.c.name,
-                       )
-                       .distinct(
-                               blocked_domains.c.name,
-                       )
-                       .where(
-                               ~sqlmodel.exists(
-                                       sqlmodel
-                                       .select(
-                                               whitelisted_domains.c.name,
-                                       )
-                                       .where(
-                                               (blocked_domains.c.name == whitelisted_domains.c.name) |
-                                               (blocked_domains.c.name.like("%." + whitelisted_domains.c.name))
-                                       )
-                               )
-                       )
-                       .cte("listed_domains")
-               )
-
-               return listed_domains
-
-       async def get_domains(self):
-               """
-                       Returns all domains that are on this list
-               """
-               domains = self.backend.db.fetch(
-                       sqlmodel
-                       .select(
-                               self.__domains.c.name,
+                               # Only fetch listed domains
+                               domains.Domain.listed == True,
                        )
+                       .order_by(
+                               domains.Domain.name,
+                       ),
                )
 
                canary_inserted = False
 
                # Walk through all domains and insert the canary
-               async for domain in domains:
+               async for name in names:
                        # If we have not inserted the canary, yet, we will do
                        # it whenever it alphabetically fits
-                       if not canary_inserted and domain > self.canary:
+                       if not canary_inserted and name > self.canary:
                                yield self.canary
                                canary_inserted = True
 
                        # Add the domain, too
-                       yield domain
+                       yield name
 
                # If we have added all domains but not the canary, we will add the canary anyways
                if not canary_inserted:
@@ -517,17 +451,28 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                        await self.update_stats()
 
        async def update_stats(self):
-               stmt = (
+               # Store the number of total domains
+               self.total_domains = await self.backend.db.fetch_one(
                        sqlmodel
                        .select(
-                               sqlmodel.func.count(
-                                       self.__domains.c.name,
-                               ),
+                               sqlmodel.func.count(),
                        )
-               )
+                       .where(
+                               domains.Domain.list == self,
 
-               # Store the number of total domains
-               self.total_domains = await self.backend.db.fetch_one(stmt)
+                               # Select only domains from this list
+                               domains.Domain.list == self,
+
+                               # Only select domains that should be blocked
+                               domains.Domain.block == True,
+
+                               # Ignore domains that have been removed
+                               domains.Domain.removed_at == None,
+
+                               # Only count listed domains
+                               domains.Domain.listed == True,
+                       )
+               )
 
                # Store the number of subsumed domains
                self.subsumed_domains = await self.backend.db.fetch_one(
@@ -831,36 +776,65 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                """
                log.info("Optimizing %s..." % self)
 
-               # Fetch all domains on this list
-               names = await self.backend.db.fetch_as_set(
+               # Fetch all whitelisted domains
+               whitelisted = await self.backend.db.fetch_as_set(
                        sqlmodel
                        .select(
-                               domains.Domain.name
+                               domains.Domain.name,
                        )
                        .distinct()
                        .where(
+                               # Select only domains from this list
                                domains.Domain.list == self,
+
+                               # Only select domains that should not be blocked
+                               domains.Domain.block == False,
+
+                               # Ignore domains that have been removed
                                domains.Domain.removed_at == None,
+                       ),
+               )
+
+               # Fetch all potentially blocked domains
+               names = await self.backend.db.fetch_as_set(
+                       sqlmodel
+                       .select(
+                               domains.Domain.name,
                        )
+                       .where(
+                               # Select only domains from this list
+                               domains.Domain.list == self,
+
+                               # Only select domains that should be blocked
+                               domains.Domain.block == True,
+
+                               # Ignore domains that have been removed
+                               domains.Domain.removed_at == None,
+
+                               # Only select domains that are not dead
+                               # or have not been checked, yet.
+                               sqlmodel.or_(
+                                       domains.Domain.dead == None,
+                                       domains.Domain.dead == False,
+                               ),
+                       ),
                )
 
+               # Collect all names that should be de-listed because they are whitelisted
+               delisted_names = set()
+
                # Collect all names that are redundant
                redundant_names = set()
 
-               # Walk through all domains
+               # List everything that isn't whitelisted
                for name in names:
-                       parent = name
+                       if util.is_name_in(name, whitelisted):
+                               delisted_names.add(name)
 
-                       # Check if any parent domain is also listed
-                       while "." in parent:
-                               *garbage, parent = parent.partition(".")
-
-                               # If the domain is already listed, we ignore it
-                               if parent in names:
+               # Walk through all domains
+               for name in names:
+                       if util.is_parent_in(name, names):
                                        redundant_names.add(name)
-                                       break
-
-               log.info(_("Identified %s redunduant domain(s)") % len(redundant_names))
 
                # Reset the status for all domains
                await self.backend.db.execute(
@@ -870,15 +844,19 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                        )
                        .values(
                                subsumed = False,
+                               listed = True,
                        )
                        .where(
                                domains.Domain.list == self,
                                domains.Domain.removed_at == None,
-                               domains.Domain.subsumed == True,
+                               sqlmodel.or_(
+                                       domains.Domain.subsumed == True,
+                                       domains.Domain.listed == False,
+                               ),
                        )
                )
 
-               # De-list the redundant domains
+               # Mark any redundant domains
                for batch in itertools.batched(redundant_names, 1000):
                        await self.backend.db.execute(
                                sqlmodel
@@ -887,6 +865,7 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                                )
                                .values(
                                        subsumed = True,
+                                       listed = False,
                                )
                                .where(
                                        domains.Domain.list == self,
@@ -897,6 +876,28 @@ class List(sqlmodel.SQLModel, database.BackendMixin, table=True):
                                )
                        )
 
+               # De-list anything that has been whitelisted
+               for batch in itertools.batched(delisted_names, 1000):
+                       await self.backend.db.execute(
+                               sqlmodel
+                               .update(
+                                       domains.Domain,
+                               )
+                               .values(
+                                       listed = False,
+                               )
+                               .where(
+                                       domains.Domain.list == self,
+                                       domains.Domain.removed_at == None,
+                                       domains.Domain.name.in_(
+                                               batch,
+                                       )
+                               )
+                       )
+
+               log.info(_("Identified %s redunduant domain(s)") % len(redundant_names))
+               log.info(_("De-listed %s domain(s)") % len(delisted_names))
+
                # Update all stats afterwards
                if update_stats:
                        await self.update_stats()
index 9bb60493d91256daa23a093bf960a405e4140e6b..b43f64d4f5acd505c8ab8ff179d400b0fbcd651b 100644 (file)
@@ -143,3 +143,26 @@ def make_verp_address(recipient):
        name, recipient = email.utils.parseaddr(recipient)
 
        return "bounces+%s@ipfire.org" % recipient.replace("@", "=")
+
+def is_name_in(name, names):
+       """
+               Checks if a name is listed in the given list or set of names
+               whilst performing sub-domain checks
+       """
+       # Check if any parent domain is also listed
+       while name:
+               # If the domain is already listed, we ignore it
+               if name in names:
+                       return True
+
+               # Trim off the left label
+               *garbage, name = name.partition(".")
+
+       # No match
+       return False
+
+def is_parent_in(name, names):
+       *garbage, parent = name.partition(".")
+
+       if parent:
+               return is_name_in(parent, names)