]> git.ipfire.org Git - location/location-database.git/blob - tools/downloader.py
downloader: Only return blocks with content in them
[location/location-database.git] / tools / downloader.py
1 #!/usr/bin/python3
2 ###############################################################################
3 # #
4 # location-database - A database to determine someone's #
5 # location on the Internet #
6 # Copyright (C) 2018 Michael Tremer #
7 # #
8 # This program is free software: you can redistribute it and/or modify #
9 # it under the terms of the GNU General Public License as published by #
10 # the Free Software Foundation, either version 3 of the License, or #
11 # (at your option) any later version. #
12 # #
13 # This program is distributed in the hope that it will be useful, #
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
16 # GNU General Public License for more details. #
17 # #
18 # You should have received a copy of the GNU General Public License #
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. #
20 # #
21 ###############################################################################
22
23 import gzip
24 import logging
25 import urllib.request
26
# Setup logger
# One module-wide logger, used by both Downloader and DownloaderContext.
log = logging.getLogger("downloader")

# Pass records on to the handlers of ancestor loggers (propagate is a boolean).
log.propagate = True
30
class Downloader:
    """
    Builds HTTP requests that share a User-Agent and an optional proxy.

    Nothing is sent by this class itself; request() hands back a
    DownloaderContext which performs the transfer when it is entered.
    """

    # Sent as the User-Agent header of every request (skipped when falsy)
    USER_AGENT = "location-database/1.0"

    def __init__(self):
        # No proxy until set_proxy() is called
        self.proxy = None

    def set_proxy(self, url):
        """
        Sets a HTTP proxy that is used to perform all requests
        """
        log.info("Using proxy %s", url)
        self.proxy = url

    def request(self, url, data=None):
        """
        Prepares a request for url and returns it wrapped in a
        DownloaderContext.

        data is passed through to urllib.request.Request unchanged.
        """
        req = urllib.request.Request(url, data=data)

        # Configure proxy
        if self.proxy:
            req.set_proxy(self.proxy, "http")

        # Set User-Agent
        if self.USER_AGENT:
            req.add_header("User-Agent", self.USER_AGENT)

        return DownloaderContext(self, req)
56
57
class DownloaderContext:
    """
    Context manager around one prepared request.

    Entering the context sends the request; leaving it closes the
    response. While the context is open, iterating over the object
    yields the body as blocks: lists of consecutive non-empty,
    non-comment lines, separated from each other by empty lines.
    """

    # Content types for which the body is transparently gunzipped
    GZIP_CONTENT_TYPES = ("application/x-gzip", "application/gzip")

    def __init__(self, downloader, request):
        self.downloader = downloader
        self.request = request

        # Save the response object
        self.response = None

    def __enter__(self):
        log.info("Retrieving %s...", self.request.full_url)

        # Send request
        self.response = urllib.request.urlopen(self.request)

        # Log the response headers
        log.debug("Response Headers:")
        for header in self.headers:
            log.debug(" %s: %s", header, self.get_header(header))

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Release the connection; returning None lets exceptions propagate
        if self.response is not None:
            self.response.close()

    def __iter__(self):
        """
        Makes the object iterable by going through each block
        """
        block = []

        for raw in self.body:
            # Convert to string and strip line-endings
            line = self._decode(raw).rstrip()

            # Skip commented lines (they do not terminate a block)
            if line.startswith("#"):
                continue

            if line:
                block.append(line)
                continue

            # End the block on an empty line, but only
            # return blocks with content in them
            if block:
                yield block

            # Reset the block
            block = []

        # Return the last block if the body did not end with an empty line
        if block:
            yield block

    @staticmethod
    def _decode(raw):
        """
        Decodes one raw line as UTF-8, falling back to latin1
        (which accepts any byte sequence).
        """
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return raw.decode("latin1")

    @property
    def headers(self):
        """
        The headers of the response, or None if nothing has been received yet.
        """
        if self.response:
            return self.response.headers

    def get_header(self, name):
        """
        Returns the value of the response header called name, or None.
        """
        headers = self.headers
        if headers:
            return headers.get(name)

    @property
    def body(self):
        """
        Returns a file-like object with the decoded content
        of the response.
        """
        content_type = self.get_header("Content-Type") or ""

        # Compare the bare media type only, so that parameters
        # ("; charset=...") and letter case do not defeat the match
        media_type = content_type.split(";", 1)[0].strip().lower()

        # Decompress any gzipped response on the fly
        if media_type in self.GZIP_CONTENT_TYPES:
            return gzip.GzipFile(fileobj=self.response, mode="rb")

        # Return the response by default
        return self.response
139
140
if __name__ == "__main__":
    # Command line use: download every URL given as an argument and
    # print its blocks to standard output.
    import sys

    # Enable debug logging
    logging.basicConfig(level=logging.DEBUG)

    d = Downloader()

    for url in sys.argv[1:]:
        print("Downloading %s..." % url)

        with d.request(url) as r:
            for block in r:
                for line in block:
                    print(line)

                # Separate consecutive blocks by an empty line
                print()