git.ipfire.org Git - dbl.git/commitdiff
sources: Support transparent decompression of compressed files
author Michael Tremer <michael.tremer@ipfire.org>
Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)
committer Michael Tremer <michael.tremer@ipfire.org>
Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/dnsbl/sources.py

index e1f476d52fce4940bc6a60e1f838fbb31b848bbe..474e9a7dc46cd7f50d628ac4b11e9b5314f8ecd3 100644 (file)
@@ -21,6 +21,7 @@
 import datetime
 import email.utils
 import enum
+import gzip
 import io
 import logging
 import sqlalchemy.dialects.postgresql
@@ -179,17 +180,11 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                                                log.debug("Source %s has not been changed, skipping processing" % self)
                                                return False
 
-                                       buffer = io.StringIO()
-
-                                       # Read the entire payload into the buffer
-                                       for chunk in response.iter_text():
-                                               buffer.write(chunk)
-
-                                       # Rewind the buffer
-                                       buffer.seek(0)
+                                       # Consume, transparently decompress and decode the payload
+                                       f = self._consume_payload(response)
 
                                        # Add all domains
-                                       for line in buffer:
+                                       for line in f:
                                                line = line.rstrip()
 
                                                # Detect the format if still unknown
@@ -264,6 +259,38 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                # Store the ETag
                self.etag = headers.get("ETag")
 
+       def _consume_payload(self, response):
+               """
+                       Buffers the whole response body, gunzips it if it starts with the gzip magic bytes, and returns it as a UTF-8 text stream
+               """
+               # Create an in-memory bytes buffer to download the entire payload into
+               f = io.BytesIO()
+
+               # Read the raw payload chunk by chunk into the buffer
+               for chunk in response.iter_bytes():
+                       f.write(chunk)
+
+               # Rewind the buffer so that we can read from the start
+               f.seek(0)
+
+               # Peek at the first two bytes to check for the gzip magic number
+               magic = f.read(2)
+
+               # Rewind again so that the magic bytes remain part of the stream
+               f.seek(0)
+
+               # Only gzip is detected here; any other payload is passed through unchanged
+               if magic == b"\x1f\x8b":
+                       log.debug("The payload seems to be gzip-compressed")
+
+                       # Wrap the buffer so that it is decompressed lazily as it is being read
+                       f = gzip.GzipFile(fileobj=f, mode="rb")
+
+               # Decode the bytes as UTF-8 text (NOTE(review): the charset is fixed here and not taken from the response - confirm this is intended)
+               f = io.TextIOWrapper(f, encoding="utf-8")
+
+               return f
+
        def _detect_format(self, line):
                """
                        Called very early when we are detecting the format