git.ipfire.org Git - dnsbl.git/commitdiff
sources: Insert domains in batches
author Michael Tremer <michael.tremer@ipfire.org>
Wed, 10 Dec 2025 17:07:57 +0000 (17:07 +0000)
committer Michael Tremer <michael.tremer@ipfire.org>
Wed, 10 Dec 2025 17:07:57 +0000 (17:07 +0000)
Since too many database roundtrips are making the parsing of other large
lists extremely slow, we will collect all domains of a list in a set (so
that we will never have any duplicates) and insert them into the database
in batches of 1000 domains at a time.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/dnsbl/sources.py

index 474e9a7dc46cd7f50d628ac4b11e9b5314f8ecd3..76e8fef184931ffb691a51226d58862c1e6c3662 100644 (file)
@@ -23,6 +23,7 @@ import email.utils
 import enum
 import gzip
 import io
+import itertools
 import logging
 import sqlalchemy.dialects.postgresql
 import sqlmodel
@@ -166,6 +167,9 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                # Initialize the format
                format = None
 
+               # Collect all domains
+               domains = set()
+
                with self.db.transaction():
                        with self.backend.client() as client:
                                # Compose some request headers
@@ -210,17 +214,22 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                                                if not domain:
                                                        continue
 
-                                               # Add the domain to the database
-                                               try:
-                                                       self.add_domain(domain)
-                                               except ValueError as e:
-                                                       log.warning("Failed to add '%s' to the database: %s" % (domain, e))
+                                               # Skip any invalid domain names
+                                               if not util.is_fqdn(domain):
+                                                       log.warning(_("Skipping invalid domain: %s") % domain)
+                                                       continue
+
+                                               # Add the domain
+                                               domains.add(domain)
 
                                        # Log an error if we could not detect the format
                                        if format is None:
                                                log.error("Format of '%s' (%s) seems to be unkown. No data could be parsed" \
                                                        % (self, self.url))
 
+                                       # Add all domains to the database
+                                       self.add_domains(domains)
+
                                # The list has now been updated
                                self.updated_at = sqlmodel.func.current_timestamp()
 
@@ -329,34 +338,38 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                """
                return line.removeprefix("0.0.0.0 ")
 
-       def add_domain(self, name):
+       def add_domains(self, domains):
                """
                        Adds or updates a domain.
                """
-               # Check if this is a valid domain name
-               if not util.is_fqdn(name):
-                       raise ValueError("Not a valid domain name: %s" % name)
-
-               stmt = (
-                       sqlalchemy.dialects.postgresql
-                       .insert(
-                               SourceDomain,
-                       )
-                       .values({
+               # Create a generator to format the values
+               domains = (
+                       {
                                "source_id" : self.id,
-                               "name"      : name,
-                       })
-                       .on_conflict_do_update(
-                               index_elements = [
-                                       SourceDomain.source_id, SourceDomain.name,
-                               ],
-                               index_where = SourceDomain.removed_at == None,
-                               set_ = {
-                                       "updated_at" : sqlmodel.func.current_timestamp(),
-                               }
-                       )
+                               "name"      : domain,
+                       }
+                       for domain in domains
                )
-               self.backend.db.execute(stmt)
+
+               # Submit domains in batches of 1000 values
+               for values in itertools.batched(domains, 1000):
+                       stmt = (
+                               sqlalchemy.dialects.postgresql
+                               .insert(
+                                       SourceDomain,
+                               )
+                               .values(values)
+                               .on_conflict_do_update(
+                                       index_elements = [
+                                               SourceDomain.source_id, SourceDomain.name,
+                                       ],
+                                       index_where = SourceDomain.removed_at == None,
+                                       set_ = {
+                                               "updated_at" : sqlmodel.func.current_timestamp(),
+                                       }
+                               )
+                       )
+                       self.backend.db.execute(stmt)
 
        def __prune(self):
                """