sources: Support transparent decompression of compressed files

author Michael Tremer <michael.tremer@ipfire.org>

Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)

committer Michael Tremer <michael.tremer@ipfire.org>

Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)
author Michael Tremer <michael.tremer@ipfire.org>
Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)
committer Michael Tremer <michael.tremer@ipfire.org>
Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)
diff --git a/src/dnsbl/sources.py b/src/dnsbl/sources.py

index e1f476d52fce4940bc6a60e1f838fbb31b848bbe..474e9a7dc46cd7f50d628ac4b11e9b5314f8ecd3 100644 (file)
--- a/src/dnsbl/sources.py
+++ b/src/dnsbl/sources.py
@@ -21,6 +21,7 @@
  import datetime
  import email.utils
  import enum
+import gzip
  import io
  import logging
  import sqlalchemy.dialects.postgresql
@@ -179,17 +180,11 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                                                 log.debug("Source %s has not been changed, skipping processing" % self)
                                                 return False
  
-                                       buffer = io.StringIO()
-
-                                       # Read the entire payload into the buffer
-                                       for chunk in response.iter_text():
-                                               buffer.write(chunk)
-
-                                       # Rewind the buffer
-                                       buffer.seek(0)
+                                       # Consume, transparently decompress and decode the payload
+                                       f = self._consume_payload(response)
  
                                         # Add all domains
-                                       for line in buffer:
+                                       for line in f:
                                                 line = line.rstrip()
  
                                                 # Detect the format if still unknown
@@ -264,6 +259,38 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                 # Store the ETag
                 self.etag = headers.get("ETag")
  
+       def _consume_payload(self, response):
+               """
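+                       Reads the entire response payload into memory and returns it as a UTF-8 text stream, transparently decompressing it if it is gzip-compressed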
+               """
+               # Create a new buffer to download the entire payload into
+               f = io.BytesIO()
+
+               # Read the entire payload into the buffer
+               for chunk in response.iter_bytes():
+                       f.write(chunk)
+
+               # Rewind the buffer
+               f.seek(0)
+
+               # Read the first two bytes to check for the gzip magic number
+               magic = f.read(2)
+
+               # Reset the buffer again
+               f.seek(0)
+
+               # If the file is gzip-compressed, we decompress on the fly
+               if magic == b"\x1f\x8b":
+                       log.debug("The payload seems to be gzip-compressed")
+
+                       # Decompress gzip
+                       f = gzip.GzipFile(fileobj=f, mode="rb")
+
+               # Decode the bytes as UTF-8 text
+               f = io.TextIOWrapper(f, encoding="utf-8")
+
+               return f
+
         def _detect_format(self, line):
                 """
                         Called very early when we are detecting the format
author	Michael Tremer <michael.tremer@ipfire.org>
	Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)
committer	Michael Tremer <michael.tremer@ipfire.org>
	Wed, 10 Dec 2025 17:04:23 +0000 (17:04 +0000)