From: Michael Tremer Date: Wed, 10 Dec 2025 17:07:57 +0000 (+0000) Subject: sources: Insert domains in batches X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8f88ca0c3a605b6f3954f429e2b12d084098c0b5;p=dnsbl.git sources: Insert domains in batches Since too many database roundtrips are making the parsing of other large lists extremely slow, we will collect them all in a set (so that we will never have any duplicates) and insert them into the database in batches of 1000 domains at a time. Signed-off-by: Michael Tremer --- diff --git a/src/dnsbl/sources.py b/src/dnsbl/sources.py index 474e9a7..76e8fef 100644 --- a/src/dnsbl/sources.py +++ b/src/dnsbl/sources.py @@ -23,6 +23,7 @@ import email.utils import enum import gzip import io +import itertools import logging import sqlalchemy.dialects.postgresql import sqlmodel @@ -166,6 +167,9 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): # Initialize the format format = None + # Collect all domains + domains = set() + with self.db.transaction(): with self.backend.client() as client: # Compose some request headers @@ -210,17 +214,22 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): if not domain: continue - # Add the domain to the database - try: - self.add_domain(domain) - except ValueError as e: - log.warning("Failed to add '%s' to the database: %s" % (domain, e)) + # Skip any invalid domain names + if not util.is_fqdn(domain): + log.warning(_("Skipping invalid domain: %s") % domain) + continue + + # Add the domain + domains.add(domain) # Log an error if we could not detect the format if format is None: log.error("Format of '%s' (%s) seems to be unkown. No data could be parsed" \ % (self, self.url)) + # Add all domains to the database + self.add_domains(domains) + # The list has now been updated self.updated_at = sqlmodel.func.current_timestamp() @@ -329,34 +338,38 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): """ return line.removeprefix("0.0.0.0 ") - def add_domain(self, name): + def add_domains(self, domains): """ Adds or updates a domain. """ - # Check if this is a valid domain name - if not util.is_fqdn(name): - raise ValueError("Not a valid domain name: %s" % name) - - stmt = ( - sqlalchemy.dialects.postgresql - .insert( - SourceDomain, - ) - .values({ + # Create a generator to format the values + domains = ( + { "source_id" : self.id, - "name" : name, - }) - .on_conflict_do_update( - index_elements = [ - SourceDomain.source_id, SourceDomain.name, - ], - index_where = SourceDomain.removed_at == None, - set_ = { - "updated_at" : sqlmodel.func.current_timestamp(), - } - ) + "name" : domain, + } + for domain in domains ) - self.backend.db.execute(stmt) + + # Submit domains in batches of 1000 values + for values in itertools.batched(domains, 1000): + stmt = ( + sqlalchemy.dialects.postgresql + .insert( + SourceDomain, + ) + .values(values) + .on_conflict_do_update( + index_elements = [ + SourceDomain.source_id, SourceDomain.name, + ], + index_where = SourceDomain.removed_at == None, + set_ = { + "updated_at" : sqlmodel.func.current_timestamp(), + } + ) + ) + self.backend.db.execute(stmt) def __prune(self): """