###############################################################################
import datetime
+import email.utils
import logging
import sqlalchemy.dialects.postgresql
import sqlmodel
# List
list : "List" = sqlmodel.Relationship(back_populates="sources")
+ # Last Modified At
+ # Timestamp parsed from the Last-Modified response header of the source, if any.
+ # It is sent back as If-Modified-Since when the source is fetched again.
+ last_modified_at : datetime.datetime | None
+
+ # ETag
+ # Value of the ETag response header of the source, if any
+ etag : str | None
+
# Domains
domains : "SourceDomain" = sqlmodel.Relationship(back_populates="source")
"""
log.debug("%s: Updating source %s" % (self.list, self))
- # XXX We should stored Etag or the last modified timestamp
-
with self.db.transaction():
with self.backend.client() as client:
- with client.stream("GET", self.url) as response:
+ # Compose some request headers
+ headers = self._make_headers()
+
+ with client.stream("GET", self.url, headers=headers) as response:
+ # Parse the response headers
+ self._parse_headers(response.headers)
+
+ # There is nothing to do if the source has not changed
+ if response.status_code == 304:
+ log.debug("Source %s has not been changed, skipping processing" % self)
+ return
+
# Add all domains
for line in response.iter_lines():
try:
# Mark all domains that have not been updated as removed
self.__prune()
+ def _make_headers(self):
+     """
+     Creates the conditional request headers we will send with the request.
+
+     Returns a dict holding at most one validator: If-Modified-Since if a
+     Last-Modified timestamp has been stored, otherwise If-None-Match if an
+     ETag has been stored. The dict is empty if we have neither.
+     """
+     headers = {}
+
+     # Send If-Modified-Since so that we won't re-import the same list
+     if self.last_modified_at:
+         timestamp = self.last_modified_at
+
+         # HTTP dates are always in GMT. Naive timestamps are taken as UTC,
+         # aware ones are converted.
+         if timestamp.tzinfo is None:
+             timestamp = timestamp.replace(tzinfo=datetime.timezone.utc)
+         else:
+             timestamp = timestamp.astimezone(datetime.timezone.utc)
+
+         # format_datetime() always emits English day and month names,
+         # whereas strftime() would follow the current locale
+         headers["If-Modified-Since"] = \
+             email.utils.format_datetime(timestamp, usegmt=True)
+
+     # If we don't have the timestamp, we will send the ETag
+     elif self.etag:
+         # ETag is a response header. To make the request conditional,
+         # the stored value has to be sent back as If-None-Match.
+         headers["If-None-Match"] = self.etag
+
+     return headers
+
+ def _parse_headers(self, headers):
+     """
+     Parses the response headers.
+
+     Stores the Last-Modified timestamp and the ETag of the response on
+     this source. Headers that are missing or cannot be parsed leave the
+     previously stored values untouched.
+     """
+     # Store Last-Modified
+     last_modified = headers.get("Last-Modified")
+     if last_modified:
+         # The value comes from the remote server and might be malformed,
+         # in which case parsedate_to_datetime() raises ValueError
+         # (TypeError on Python versions before 3.10)
+         try:
+             self.last_modified_at = email.utils.parsedate_to_datetime(last_modified)
+         except (TypeError, ValueError):
+             log.warning("%s: Could not parse Last-Modified header: %s" % (self, last_modified))
+
+     # Store the ETag, but keep the one we have if the response does not carry any
+     etag = headers.get("ETag")
+     if etag:
+         self.etag = etag
+
def add_domain(self, name):
"""
Adds or updates a domain.