From: Michael Tremer Date: Wed, 10 Dec 2025 17:04:23 +0000 (+0000) Subject: sources: Support transparent decompression of compressed files X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=48754e6ebaf63094f7bdd792f17ccab4d3d428bb;p=dbl.git sources: Support transparent decompression of compressed files Signed-off-by: Michael Tremer --- diff --git a/src/dnsbl/sources.py b/src/dnsbl/sources.py index e1f476d..474e9a7 100644 --- a/src/dnsbl/sources.py +++ b/src/dnsbl/sources.py @@ -21,6 +21,7 @@ import datetime import email.utils import enum +import gzip import io import logging import sqlalchemy.dialects.postgresql @@ -179,17 +180,11 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): log.debug("Source %s has not been changed, skipping processing" % self) return False - buffer = io.StringIO() - - # Read the entire payload into the buffer - for chunk in response.iter_text(): - buffer.write(chunk) - - # Rewind the buffer - buffer.seek(0) + # Consume, transparently decompress and decode the payload + f = self._consume_payload(response) # Add all domains - for line in buffer: + for line in f: line = line.rstrip() # Detect the format if still unknown @@ -264,6 +259,38 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True): # Store the ETag self.etag = headers.get("ETag") + def _consume_payload(self, response): + """ + Transparently decompresses any data + """ + # Create a new buffer to download the entire payload into + f = io.BytesIO() + + # Read the entire payload into the buffer + for chunk in response.iter_bytes(): + f.write(chunk) + + # Rewind the buffer + f.seek(0) + + # Read some magic bytes + magic = f.read(2) + + # Reset the buffer again + f.seek(0) + + # If the file is gzip-compressed, we decompress on the fly + if magic == b"\x1f\x8b": + log.debug("The payload seems to be gzip-compressed") + + # Decompress gzip + f = gzip.GzipFile(fileobj=f, mode="rb") + + # Convert into UTF-8 + f = io.TextIOWrapper(f, encoding="utf-8") + + return f + def _detect_format(self, line): """ Called very early when we are detecting the format