import enum
import gzip
import io
+import itertools
import logging
import sqlalchemy.dialects.postgresql
import sqlmodel
# Initialize the format
format = None
+ # Collect all domains
+ domains = set()
+
with self.db.transaction():
with self.backend.client() as client:
# Compose some request headers
if not domain:
continue
- # Add the domain to the database
- try:
- self.add_domain(domain)
- except ValueError as e:
- log.warning("Failed to add '%s' to the database: %s" % (domain, e))
+ # Skip any invalid domain names
+ if not util.is_fqdn(domain):
+ log.warning(_("Skipping invalid domain: %s") % domain)
+ continue
+
+ # Add the domain
+ domains.add(domain)
# Log an error if we could not detect the format
if format is None:
log.error("Format of '%s' (%s) seems to be unkown. No data could be parsed" \
% (self, self.url))
+ # Add all domains to the database
+ self.add_domains(domains)
+
# The list has now been updated
self.updated_at = sqlmodel.func.current_timestamp()
"""
return line.removeprefix("0.0.0.0 ")
def add_domains(self, domains):
    """
    Adds or updates a collection of domains for this source.

    "domains" is any iterable of domain names. Each name is inserted as
    a SourceDomain row carrying this source's ID. If the insert conflicts
    on (source_id, name) among the rows where removed_at is NULL, only
    the "updated_at" timestamp of the existing row is refreshed.

    Names are not validated here, so the caller has to skip any invalid
    names before passing them in.
    """
    # Submit domains in batches of 1000 names so that no single INSERT
    # statement grows without bound (itertools.batched needs Python 3.12+)
    for batch in itertools.batched(domains, 1000):
        # PostgreSQL rejects an INSERT ... ON CONFLICT DO UPDATE which would
        # touch the same row twice, so drop duplicates within the batch.
        # Duplicates in different batches are sent as separate statements.
        values = [
            {
                "source_id" : self.id,
                "name"      : name,
            }
            for name in dict.fromkeys(batch)
        ]

        stmt = (
            sqlalchemy.dialects.postgresql
            .insert(
                SourceDomain,
            )
            .values(values)
            .on_conflict_do_update(
                index_elements = [
                    SourceDomain.source_id, SourceDomain.name,
                ],
                # "== None" is intentional: SQLAlchemy overloads the
                # comparison and renders it as "IS NULL"
                index_where = SourceDomain.removed_at == None,
                set_ = {
                    "updated_at" : sqlmodel.func.current_timestamp(),
                },
            )
        )
        self.backend.db.execute(stmt)
def __prune(self):
"""