import datetime
import email.utils
import enum
+import gzip
import io
import logging
import sqlalchemy.dialects.postgresql
log.debug("Source %s has not been changed, skipping processing" % self)
return False
- buffer = io.StringIO()
-
- # Read the entire payload into the buffer
- for chunk in response.iter_text():
- buffer.write(chunk)
-
- # Rewind the buffer
- buffer.seek(0)
+ # Consume, transparently decompress and decode the payload
+ f = self._consume_payload(response)
# Add all domains
- for line in buffer:
+ for line in f:
line = line.rstrip()
# Detect the format if still unknown
# Store the ETag
self.etag = headers.get("ETag")
+ def _consume_payload(self, response):
+ """
+ Downloads the payload of the response and returns it as a text stream
+
+ The entire body is read into an in-memory buffer first. If it starts
+ with the gzip magic bytes, the buffer is wrapped so that it is
+ decompressed while being read; no other compression format is
+ detected. The returned file object decodes the data as UTF-8 and
+ can be iterated line by line.
+ """
+ # Create a new buffer to download the entire payload into
+ f = io.BytesIO()
+
+ # Read the entire payload into the buffer (as raw bytes, so that
+ # compressed data is not mangled by text decoding)
+ for chunk in response.iter_bytes():
+ f.write(chunk)
+
+ # Rewind the buffer
+ f.seek(0)
+
+ # Peek at the first two bytes which identify the format
+ magic = f.read(2)
+
+ # Rewind again so that the reader starts at the beginning of the payload
+ f.seek(0)
+
+ # If the file is gzip-compressed (magic number 1f 8b), we decompress on the fly
+ if magic == b"\x1f\x8b":
+ log.debug("The payload seems to be gzip-compressed")
+
+ # Wrap the buffer so that it is decompressed as it is being read
+ f = gzip.GzipFile(fileobj=f, mode="rb")
+
+ # Decode the bytes as UTF-8 so that the caller receives text
+ f = io.TextIOWrapper(f, encoding="utf-8")
+
+ return f
+
def _detect_format(self, line):
"""
Called very early when we are detecting the format