From: Michael Tremer
Date: Sat, 6 Dec 2025 17:10:14 +0000 (+0000)
Subject: sources: Only download when upstream has actually changed
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=51e5629422d4c3f6aad741417921321d7cecd13f;p=dbl.git

sources: Only download when upstream has actually changed

Signed-off-by: Michael Tremer
---

diff --git a/src/dnsbl/sources.py b/src/dnsbl/sources.py
index 00db4a1..1def5d7 100644
--- a/src/dnsbl/sources.py
+++ b/src/dnsbl/sources.py
@@ -19,6 +19,7 @@
 ###############################################################################
 
 import datetime
+import email.utils
 import logging
 import sqlalchemy.dialects.postgresql
 import sqlmodel
@@ -98,6 +99,12 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
 	# List
 	list : "List" = sqlmodel.Relationship(back_populates="sources")
 
+	# Last Modified At
+	last_modified_at : datetime.datetime | None
+
+	# ETag
+	etag : str | None
+
 	# Domains
 	domains : "SourceDomain" = sqlmodel.Relationship(back_populates="source")
 
@@ -107,11 +114,20 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
 		"""
 		log.debug("%s: Updating source %s" % (self.list, self))
 
-		# XXX We should stored Etag or the last modified timestamp
-
 		with self.db.transaction():
 			with self.backend.client() as client:
-				with client.stream("GET", self.url) as response:
+				# Compose some request headers
+				headers = self._make_headers()
+
+				with client.stream("GET", self.url, headers=headers) as response:
+					# Parse the response headers
+					self._parse_headers(response.headers)
+
+					# There is nothing to do if the source has not changed
+					if response.status_code == 304:
+						log.debug("Source %s has not been changed, skipping processing" % self)
+						return
+
 					# Add all domains
 					for line in response.iter_lines():
 						try:
@@ -123,6 +139,39 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
 			# Mark all domains that have not been updated as removed
 			self.__prune()
 
+	def _make_headers(self):
+		"""
+			Creates the conditional request headers we will send upstream.
+
+			Both validators are sent when we have them, so that the server
+			can respond with "304 Not Modified" if nothing has changed.
+		"""
+		headers = {}
+
+		# Send If-Modified-Since so that we won't re-import the same list
+		if self.last_modified_at:
+			headers["If-Modified-Since"] = \
+				self.last_modified_at.strftime("%a, %d %b %Y %H:%M:%S GMT")
+
+		# Send the stored ETag back as If-None-Match (ETag itself is a
+		# response header and would be ignored as a request header)
+		if self.etag:
+			headers["If-None-Match"] = self.etag
+
+		return headers
+
+	def _parse_headers(self, headers):
+		"""
+			Parses the response headers.
+		"""
+		# Store Last-Modified
+		last_modified = headers.get("Last-Modified")
+		if last_modified:
+			self.last_modified_at = email.utils.parsedate_to_datetime(last_modified)
+
+		# Store the ETag
+		self.etag = headers.get("ETag")
+
 	def add_domain(self, name):
 		"""
 			Adds or updates a domain.