]> git.ipfire.org Git - dnsbl.git/commitdiff
sources: Only download when upstream has actually changed
author Michael Tremer <michael.tremer@ipfire.org>
Sat, 6 Dec 2025 17:10:14 +0000 (17:10 +0000)
committer Michael Tremer <michael.tremer@ipfire.org>
Sat, 6 Dec 2025 17:10:14 +0000 (17:10 +0000)
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/dnsbl/sources.py

index 00db4a12645ecf4b4a90d1dc63c89b5940711344..1def5d76e238d6b04c4109e8f7db37e2a5869e39 100644 (file)
@@ -19,6 +19,7 @@
 ###############################################################################
 
 import datetime
+import email.utils
 import logging
 import sqlalchemy.dialects.postgresql
 import sqlmodel
@@ -98,6 +99,12 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
        # List
        list : "List" = sqlmodel.Relationship(back_populates="sources")
 
+       # Last Modified At
+       last_modified_at : datetime.datetime | None
+
+       # ETag
+       etag : str | None
+
        # Domains
        domains : "SourceDomain" = sqlmodel.Relationship(back_populates="source")
 
@@ -107,11 +114,20 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                """
                log.debug("%s: Updating source %s" % (self.list, self))
 
-               # XXX We should stored Etag or the last modified timestamp
-
                with self.db.transaction():
                        with self.backend.client() as client:
-                               with client.stream("GET", self.url) as response:
+                               # Compose some request headers
+                               headers = self._make_headers()
+
+                               with client.stream("GET", self.url, headers=headers) as response:
+                                       # Parse the response headers
+                                       self._parse_headers(response.headers)
+
+                                       # There is nothing to do if the source has not changed
+                                       if response.status_code == 304:
+                                               log.debug("Source %s has not been changed, skipping processing" % self)
+                                               return
+
                                        # Add all domains
                                        for line in response.iter_lines():
                                                try:
@@ -123,6 +139,35 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                        # Mark all domains that have not been updated as removed
                        self.__prune()
 
+       def _make_headers(self):
+               """
+                       Returns a dict of conditional request headers built from the stored validators.
+               """
+               headers = {}
+
+               # Send If-Modified-Since so that we won't re-import the same list
+               if self.last_modified_at:
+                       headers["If-Modified-Since"] = \
+                               self.last_modified_at.strftime("%a, %d %b %Y %H:%M:%S GMT")
+
+               # Without a timestamp, send the stored ETag back as If-None-Match (ETag is response-only)
+               elif self.etag:
+                       headers["If-None-Match"] = self.etag
+
+               return headers
+
+       def _parse_headers(self, headers):
+               """
+                       Remembers the validators (Last-Modified and ETag) sent with a response.
+               """
+               # Only replace the stored timestamp if this response carries a
+               # Last-Modified header; otherwise the previous value is kept
+               if (stamp := headers.get("Last-Modified")):
+                       self.last_modified_at = email.utils.parsedate_to_datetime(stamp)
+
+               # The ETag is stored exactly as headers.get() returns it, header present or not
+               self.etag = headers.get("ETag")
+
        def add_domain(self, name):
                """
                        Adds or updates a domain.