importer: Move the split functions into the main importer
diff --git a/src/python/location/importer.py b/src/python/location/importer.py
index dee36ed9dfacb2c6403ec4d53d30cc122c784ec1..58ec3686f74032e6947b6eb0ceb544e78f920554 100644
--- a/src/python/location/importer.py
+++ b/src/python/location/importer.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python3
 ###############################################################################
 #                                                                             #
 # libloc - A library to determine the location of someone on the Internet     #
 
 import gzip
 import logging
+import tempfile
 import urllib.request
 
 # Initialise logging
 log = logging.getLogger("location.importer")
 log.propagate = 1
 
-WHOIS_SOURCES = {
-       # African Network Information Centre
-       "AFRINIC": [
-               "https://ftp.afrinic.net/pub/pub/dbase/afrinic.db.gz"
-               ],
-
-       # Asia Pacific Network Information Centre
-       "APNIC": [
-               "https://ftp.apnic.net/apnic/whois/apnic.db.inet6num.gz",
-               "https://ftp.apnic.net/apnic/whois/apnic.db.inetnum.gz",
-               #"https://ftp.apnic.net/apnic/whois/apnic.db.route6.gz",
-               #"https://ftp.apnic.net/apnic/whois/apnic.db.route.gz",
-               "https://ftp.apnic.net/apnic/whois/apnic.db.aut-num.gz",
-               "https://ftp.apnic.net/apnic/whois/apnic.db.organisation.gz"
-               ],
-
-       # American Registry for Internet Numbers
-       # XXX there is nothing useful for us in here
-       # ARIN: [
-       #       "https://ftp.arin.net/pub/rr/arin.db"
-       # ],
-
-       # Japan Network Information Center
-       "JPNIC": [
-               "https://ftp.nic.ad.jp/jpirr/jpirr.db.gz"
-               ],
-
-       # Latin America and Caribbean Network Information Centre
-       "LACNIC": [
-               "https://ftp.lacnic.net/lacnic/dbase/lacnic.db.gz"
-               ],
-
-       # Réseaux IP Européens
-       "RIPE": [
-               "https://ftp.ripe.net/ripe/dbase/split/ripe.db.inet6num.gz",
-               "https://ftp.ripe.net/ripe/dbase/split/ripe.db.inetnum.gz",
-               #"https://ftp.ripe.net/ripe/dbase/split/ripe.db.route6.gz",
-               #"https://ftp.ripe.net/ripe/dbase/split/ripe.db.route.gz",
-               "https://ftp.ripe.net/ripe/dbase/split/ripe.db.aut-num.gz",
-               "https://ftp.ripe.net/ripe/dbase/split/ripe.db.organisation.gz"
-               ],
-}
-
-EXTENDED_SOURCES = {
-       # African Network Information Centre
-       # "ARIN": [
-       #       "https://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-extended-latest"
-       # ],
-
-       # Asia Pacific Network Information Centre
-       # "APNIC": [
-       #       "https://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-extended-latest"
-       # ],
-
-       # American Registry for Internet Numbers
-       "ARIN": [
-               "https://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest"
-               ],
-
-       # Latin America and Caribbean Network Information Centre
-       "LACNIC": [
-               "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
-               ],
-
-       # Réseaux IP Européens
-       # "RIPE": [
-       #       "https://ftp.ripe.net/pub/stats/ripencc/delegated-ripencc-extended-latest"
-       # ],
-}
-
 class Downloader(object):
        def __init__(self):
                self.proxy = None
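
The registry source maps removed above (WHOIS_SOURCES and EXTENDED_SOURCES) now live in the main importer, as the commit subject says. For orientation, this is roughly how such a map is consumed together with the Downloader class — a minimal sketch, not code from this repository, and the parser call at the end is hypothetical. The import path follows the file's location (src/python/location/importer.py):

    # Sketch: walk a registry/URL map and download each database.
    # Downloader.retrieve() is defined in the hunk below.
    from location.importer import Downloader

    WHOIS_SOURCES = {
        # Latin America and Caribbean Network Information Centre
        "LACNIC": ["https://ftp.lacnic.net/lacnic/dbase/lacnic.db.gz"],
    }

    downloader = Downloader()

    for registry, urls in WHOIS_SOURCES.items():
        for url in urls:
            f = downloader.retrieve(url)
            parse_whois_dump(registry, f)  # hypothetical consumer
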
@@ -106,145 +36,66 @@ class Downloader(object):
                log.info("Using proxy %s" % url)
                self.proxy = url
 
-       def request(self, url, data=None, return_blocks=False):
-               req = urllib.request.Request(url, data=data)
+       def retrieve(self, url, **kwargs):
+               """
+                       Fetches the content at the given URL and returns a
+                       file object backed by a temporary file.
+
+                       If the content is gzip-compressed, it is decompressed on the fly.
+               """
+               # Open a temporary file to buffer the downloaded content
+               # (held in memory up to 100 MiB, then spilled to disk)
+               t = tempfile.SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+
+               # Create a new request
+               req = urllib.request.Request(url, **kwargs)
 
                # Configure proxy
                if self.proxy:
                        req.set_proxy(self.proxy, "http")
 
-               return DownloaderContext(self, req, return_blocks=return_blocks)
-
-
-class DownloaderContext(object):
-       def __init__(self, downloader, request, return_blocks=False):
-               self.downloader = downloader
-               self.request = request
-
-               # Should we return one block or a single line?
-               self.return_blocks = return_blocks
-
-               # Save the response object
-               self.response = None
-
-       def __enter__(self):
-               log.info("Retrieving %s..." % self.request.full_url)
+               log.info("Retrieving %s..." % req.full_url)
 
                # Send request
-               self.response = urllib.request.urlopen(self.request)
+               res = urllib.request.urlopen(req)
 
                # Log the response headers
                log.debug("Response Headers:")
-               for header in self.headers:
-                       log.debug("     %s: %s" % (header, self.get_header(header)))
-
-               return self
-
-       def __exit__(self, type, value, traceback):
-               pass
+               for header in res.headers:
+                       log.debug("     %s: %s" % (header, res.headers[header]))
 
-       def __iter__(self):
-               """
-                       Makes the object iterable by going through each block
-               """
-               if self.return_blocks:
-                       return iterate_over_blocks(self.body)
+               # Write the payload to the temporary file
+               with res as f:
+                       while True:
+                               buf = f.read(65536)
+                               if not buf:
+                                       break
 
-               return iterate_over_lines(self.body)
+                               t.write(buf)
 
-       @property
-       def headers(self):
-               if self.response:
-                       return self.response.headers
+               # Rewind the temporary file
+               t.seek(0)
 
-       def get_header(self, name):
-               if self.headers:
-                       return self.headers.get(name)
+               gzip_compressed = False
 
-       @property
-       def body(self):
-               """
-                       Returns a file-like object with the decoded content
-                       of the response.
-               """
-               content_type = self.get_header("Content-Type")
+               # Fetch the content type
+               content_type = res.headers.get("Content-Type")
 
                # Decompress any gzipped response on the fly
                if content_type in ("application/x-gzip", "application/gzip"):
-                       return gzip.GzipFile(fileobj=self.response, mode="rb")
-
-               # Return the response by default
-               return self.response
-
-
-def read_blocks(f):
-       for block in iterate_over_blocks(f):
-               type = None
-               data = {}
-
-               for i, line in enumerate(block):
-                       key, value = line.split(":", 1)
-
-                       # The key of the first line defines the type
-                       if i == 0:
-                               type = key
-
-                       # Store value
-                       data[key] = value.strip()
-
-               yield type, data
-
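
To make the parsing contract above concrete: read_blocks() consumes the blocks produced by iterate_over_blocks() (next in this hunk) and flattens each one into a (type, data) pair, where the key of the first line names the object type. A minimal sketch with made-up input:

    import io

    # One whois-style object, terminated by a blank line
    sample = io.BytesIO(
        b"inetnum: 192.0.2.0 - 192.0.2.255\n"
        b"netname: EXAMPLE-NET\n"
        b"country: DE\n"
        b"\n"
    )

    for type, data in read_blocks(sample):
        print(type)  # "inetnum"
        print(data)  # {"inetnum": "192.0.2.0 - 192.0.2.255",
                     #  "netname": "EXAMPLE-NET", "country": "DE"}

Note that these helpers are being removed here; per the commit subject they move into the main importer, so after this change the call would target that script instead.
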
-def iterate_over_blocks(f, charsets=("utf-8", "latin1")):
-       block = []
-
-       for line in f:
-               # Convert to string
-               for charset in charsets:
-                       try:
-                               line = line.decode(charset)
-                       except UnicodeDecodeError:
-                               continue
-                       else:
-                               break
-
-               # Skip commented lines
-               if line.startswith("#") or line.startswith("%"):
-                       continue
-
-               # Strip line-endings
-               line = line.rstrip()
-
-               # Remove any comments at the end of line
-               line, hash, comment = line.partition("#")
-
-               if comment:
-                       # Strip any whitespace before the comment
-                       line = line.rstrip()
-
-                       # If the line is now empty, we move on
-                       if not line:
-                               continue
-
-               if line:
-                       block.append(line)
-                       continue
-
-               # End the block on an empty line
-               if block:
-                       yield block
+                       gzip_compressed = True
 
-               # Reset the block
-               block = []
+               # Check for the gzip magic in case web servers send a different MIME type
+               elif t.read(2) == b"\x1f\x8b":
+                       gzip_compressed = True
 
-       # Return the last block
-       if block:
-               yield block
+               # Reset again
+               t.seek(0)
 
+               # Decompress the temporary file
+               if gzip_compressed:
+                       log.debug("Gzip compression detected")
 
-def iterate_over_lines(f):
-       for line in f:
-               # Decode the line
-               line = line.decode()
+                       t = gzip.GzipFile(fileobj=t, mode="rb")
 
-               # Strip the ending
-               yield line.rstrip()
+               # Return the temporary file handle
+               return t
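
A short usage sketch of the new retrieve() method. The proxy URL is a placeholder; the database URL is one of the RIPE sources listed above. retrieve() hands back a rewound, binary file object and transparently decompresses gzip payloads, whether the server flags them via Content-Type or they are only recognisable by the \x1f\x8b magic bytes:

    from location.importer import Downloader

    downloader = Downloader()
    downloader.proxy = "http://proxy.example.net:3128"  # placeholder, optional

    # Download a gzip-compressed database dump; the payload is buffered
    # in a temporary file and decompressed before being returned
    f = downloader.retrieve("https://ftp.ripe.net/ripe/dbase/split/ripe.db.inetnum.gz")

    for line in f:
        # The file object yields raw bytes
        print(line.decode("utf-8", errors="replace").rstrip())

Because the whole payload is spooled into a temporary file before it is returned, parsing can be arbitrarily slow without stalling the HTTP connection — a property the streaming DownloaderContext above did not have.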