diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
     including quotes in the case of strings.
     e.g. --user_agent='"foobar/2.0"'

+  --output FILE
   -o FILE          write output to FILE, otherwise the basename of the
                    url will be used
   -O               print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
         return ug_options, ug_defaults

     def process_command_line(self):
-        short_options = 'vd:hoOpD'
+        short_options = 'vd:ho:OpD'
         long_options = ['profile', 'repeat=', 'verbose=',
-                        'debug=', 'help', 'progress']
+                        'debug=', 'help', 'progress', 'output=']
         ug_long = [ o + '=' for o in self.ug_options ]
-        optlist, args = getopt.getopt(sys.argv[1:], short_options,
-                                      long_options + ug_long)
+        try:
+            optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                          long_options + ug_long)
+        except getopt.GetoptError, e:
+            print >>sys.stderr, "Error:", e
+            self.help([], ret=1)
+
         self.verbose = 0
         self.debug = None
         self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
             if o == '--verbose': self.verbose = v
             if o == '-v': self.verbose += 1
             if o == '-o': self.outputfile = v
+            if o == '--output': self.outputfile = v
             if o == '-p' or o == '--progress': self.progress = 1
             if o == '-d' or o == '--debug': self.debug = v
             if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
             print "ERROR: cannot use -o when grabbing multiple files"
             sys.exit(1)

-    def help(self, args):
+    def help(self, args, ret=0):
         if not args:
             print MAINHELP
         else:
@@ -234,7 +241,7 @@ class client_options:
                     self.help_ug_option(a)
                 else:
                     print 'ERROR: no help on command "%s"' % a
-        sys.exit(0)
+        sys.exit(ret)

     def help_doc(self):
         print __doc__
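The short-option string gains a colon ('vd:ho:OpD'), so -o now consumes its FILE argument instead of parsing as a bare flag, and getopt errors now produce a help message and exit status 1 instead of a traceback. A minimal standalone sketch of the corrected pattern (the argv here is hypothetical):

    import sys, getopt

    try:
        # 'o:' (with the colon) makes -o consume an argument, e.g. "-o FILE"
        optlist, args = getopt.getopt(['-o', 'out.txt', '-v'], 'vd:ho:OpD',
                                      ['output=', 'progress'])
    except getopt.GetoptError, e:
        print >>sys.stderr, "Error:", e
        sys.exit(1)

    print optlist   # [('-o', 'out.txt'), ('-v', '')]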
diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
new file mode 100755
index 0000000..3dafb12
--- /dev/null
+++ b/scripts/urlgrabber-ext-down
@@ -0,0 +1,75 @@
+#! /usr/bin/python
+#  A very simple external downloader
+#  Copyright 2011-2012 Zdenek Pavlas
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+import time, os, errno, sys
+from urlgrabber.grabber import \
+    _readlines, URLGrabberOptions, _loads, \
+    PyCurlFileObject, URLGrabError
+
+def write(fmt, *arg):
+    try: os.write(1, fmt % arg)
+    except OSError, e:
+        if e.args[0] != errno.EPIPE: raise
+        sys.exit(1)
+
+class ProxyProgress:
+    def start(self, *d1, **d2):
+        self.next_update = 0
+    def update(self, _amount_read):
+        t = time.time()
+        if t < self.next_update: return
+        self.next_update = t + 0.31
+        write('%d %d\n', self._id, _amount_read)
+
+def main():
+    import signal
+    signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
+    cnt = 0
+    while True:
+        lines = _readlines(0)
+        if not lines: break
+        for line in lines:
+            cnt += 1
+            opts = URLGrabberOptions()
+            opts._id = cnt
+            for k in line.split(' '):
+                k, v = k.split('=', 1)
+                setattr(opts, k, _loads(v))
+            if opts.progress_obj:
+                opts.progress_obj = ProxyProgress()
+                opts.progress_obj._id = cnt
+
+            dlsz = dltm = 0
+            try:
+                fo = PyCurlFileObject(opts.url, opts.filename, opts)
+                fo._do_grab()
+                fo.fo.close()
+                size = fo._amount_read
+                if fo._tm_last:
+                    dlsz = fo._tm_last[0] - fo._tm_first[0]
+                    dltm = fo._tm_last[1] - fo._tm_first[1]
+                ug_err = 'OK'
+            except URLGrabError, e:
+                size = 0
+                ug_err = '%d %s' % e.args
+
+            write('%d %d %d %.3f %s\n', opts._id, size, dlsz, dltm, ug_err)
+
+if __name__ == '__main__':
+    main()
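The helper speaks a simple line protocol: it reads space-separated key=value requests (values encoded with _dumps(), defined later in this patch) on stdin, and writes progress lines ('id amount') or completion lines ('id size dlsz dltm status') to stdout, where status is 'OK' or an errno followed by a message. A hedged sketch of driving it by hand; the real client is the _ExternalDownloader class added to grabber.py further down:

    import subprocess
    from urlgrabber.grabber import _dumps

    p = subprocess.Popen(['/usr/libexec/urlgrabber-ext-down'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # one request per line, keys matching URLGrabberOptions attributes
    req = 'url=%s filename=%s' % (_dumps('http://example.com/index.html'),
                                  _dumps('/tmp/index.html'))
    p.stdin.write(req + '\n')
    print p.stdout.readline().strip()   # e.g. '1 1256 1256 0.041 OK'
    p.stdin.close(); p.wait()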
diff --git a/setup.py b/setup.py
index d0b87b8..bfa4a18 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,10 @@ url = _urlgrabber.__url__
 packages = ['urlgrabber']
 package_dir = {'urlgrabber':'urlgrabber'}
 scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
-               ['README','LICENSE', 'TODO', 'ChangeLog'])]
+data_files = [
+    ('share/doc/' + name + '-' + version, ['README','LICENSE', 'TODO', 'ChangeLog']),
+    ('libexec', ['scripts/urlgrabber-ext-down']),
+]
 options = { 'clean' : { 'all' : 1 } }
 classifiers = [
     'Development Status :: 4 - Beta',
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
 from munittest import *

-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp = 'ftp://localhost/test/'

 # set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 3e5f3b7..8eeaeda 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):

     def http_error_416(self, req, fp, code, msg, hdrs):
         # HTTP's Range Not Satisfiable error
-        raise RangeError('Requested Range Not Satisfiable')
+        raise RangeError(9, 'Requested Range Not Satisfiable')

 class HTTPSRangeHandler(HTTPRangeHandler):
     """ Range Header support for HTTPS. """
@@ -208,7 +208,7 @@ class RangeableFileObject:
             bufsize = offset - pos
             buf = self.fo.read(bufsize)
             if len(buf) != bufsize:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             pos+= bufsize

 class FileRangeHandler(urllib2.FileHandler):
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
             (fb,lb) = brange
             if lb == '': lb = size
             if fb < 0 or fb > size or lb > size:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             size = (lb - fb)
             fo = RangeableFileObject(fo, (fb,lb))
         headers = mimetools.Message(StringIO(
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
             (fb,lb) = range_tup
             if lb == '':
                 if retrlen is None or retrlen == 0:
-                    raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+                    raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
                 lb = retrlen
                 retrlen = lb - fb
                 if retrlen < 0:
                     # beginning of range is larger than file
-                    raise RangeError('Requested Range Not Satisfiable')
+                    raise RangeError(9, 'Requested Range Not Satisfiable')
             else:
                 retrlen = lb - fb
                 fp = RangeableFileObject(fp, (0,retrlen))
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
     # check if range is over the entire file
     if (fb,lb) == (0,''): return None
     # check that the range is valid
-    if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+    if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
     return (fb,lb)

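Every RangeError is now raised with an explicit error number, matching the (errno, strerror) convention of URLGrabError/IOError, so callers can classify the failure numerically instead of parsing the message. A short sketch, assuming RangeError keeps its upstream IOError-derived base:

    from urlgrabber.byterange import RangeError

    try:
        raise RangeError(9, 'Requested Range Not Satisfiable')
    except RangeError, e:
        print e.errno      # 9
        print e.strerror   # 'Requested Range Not Satisfiable'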
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..6ce9861 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
   progress_obj = None

     a class instance that supports the following methods:
-      po.start(filename, url, basename, length, text)
+      po.start(filename, url, basename, size, now, text)
       # length will be None if unknown
       po.update(read) # read == bytes read so far
       po.end()

+  multi_progress_obj = None
+
+    a class instance that supports the following methods:
+      mo.start(total_files, total_size)
+      mo.newMeter() => meter
+      mo.removeMeter(meter)
+      mo.end()
+
+    The 'meter' object is similar to progress_obj, but multiple
+    instances may be created and updated at the same time.
+
+    When downloading multiple files in parallel and multi_progress_obj
+    is None progress_obj is used in compatibility mode: finished files
+    are shown but there's no in-progress display.
+
   text = None

     specifies alternative text to be passed to the progress meter
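A minimal sketch of an object satisfying the documented multi_progress_obj interface; the class names here are illustrative, not part of urlgrabber. Each parallel download gets its own meter from newMeter():

    class SimpleMultiMeter:
        def start(self, total_files, total_size):
            print 'downloading %d files, %d bytes total' % (total_files,
                                                            total_size)
        def newMeter(self):
            return _SimpleMeter()
        def removeMeter(self, meter):
            pass
        def end(self):
            print 'all downloads finished'

    class _SimpleMeter:
        def start(self, filename, url, basename, size, now, text):
            self.basename, self.size = basename, size
        def update(self, amount_read):
            print '%s: %d/%s bytes' % (self.basename, amount_read, self.size)
        def end(self, amount_read):
            print '%s: done' % self.basename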
@@ -68,14 +83,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.

-  timeout = None
+  timeout = 300

-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option, see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT

   bandwidth = 0

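A one-line illustration of the new semantics. The same value also drives the stall detector added further down in this patch (pycurl LOW_SPEED_LIMIT/LOW_SPEED_TIME), so a transfer crawling below ~1000 B/s for that many seconds is aborted as well:

    from urlgrabber import urlgrab
    # connection must be established within 30s; 0 or None disables the timeout
    urlgrab('http://example.com/big.iso', '/tmp/big.iso', timeout=30)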
@@ -143,8 +158,12 @@ GENERAL ARGUMENTS (kwargs)
     note that proxy authentication information may be provided using
     normal URL constructs:
       proxies={ 'http' : 'http://user:host@foo:3128' }
-    Lastly, if proxies is None, the default environment settings will
-    be used.
+
+  libproxy = False
+
+    Use the libproxy module (if installed) to find proxies.
+    The libproxy code is only used if the proxies dictionary
+    does not provide any proxies.

   prefix = None

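A sketch of the lookup order this implies, as implemented by the find_proxy() method added to URLGrabberOptions later in this patch: the explicit dictionary wins ('_none_' disables proxying), the other http(s) scheme is tried as a fallback, and only then libproxy:

    from urlgrabber.grabber import URLGrabberOptions

    opts = URLGrabberOptions(proxies={'http': 'http://proxy.example:3128',
                                      'ftp': '_none_'})
    opts.find_proxy('http://host/path', 'http')
    print opts.proxy   # 'http://proxy.example:3128'
    opts.find_proxy('ftp://host/path', 'ftp')
    print opts.proxy   # '' (empty string: proxy explicitly disabled)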
@@ -198,6 +217,12 @@ GENERAL ARGUMENTS (kwargs)
     control, you should probably subclass URLParser and pass it in via
     the 'urlparser' option.

+  username = None
+    username to use for simple http auth - is automatically quoted for special characters
+
+  password = None
+    password to use for simple http auth - is automatically quoted for special characters
+
   ssl_ca_cert = None

     this option can be used if M2Crypto is available and will be
@@ -211,43 +236,75 @@ GENERAL ARGUMENTS (kwargs)
     No-op when using the curl backend (default)


-  self.ssl_verify_peer = True
+  ssl_verify_peer = True

     Check the server's certificate to make sure it is valid with what our CA validates

-  self.ssl_verify_host = True
+  ssl_verify_host = True

     Check the server's hostname to make sure it matches the certificate DN

-  self.ssl_key = None
+  ssl_key = None

     Path to the key the client should use to connect/authenticate with

-  self.ssl_key_type = 'PEM'
+  ssl_key_type = 'PEM'

     PEM or DER - format of key

-  self.ssl_cert = None
+  ssl_cert = None

     Path to the ssl certificate the client should use to authenticate with

-  self.ssl_cert_type = 'PEM'
+  ssl_cert_type = 'PEM'

     PEM or DER - format of certificate

-  self.ssl_key_pass = None
+  ssl_key_pass = None

     password to access the ssl_key

-  self.size = None
+  size = None

     size (in bytes) or Maximum size of the thing being downloaded.
     This is mostly to keep us from exploding with an endless datastream

-  self.max_header_size = 2097152
+  max_header_size = 2097152

     Maximum size (in bytes) of the headers.

+  ip_resolve = 'whatever'
+
+    What type of name-to-IP resolution to use; the default is to do
+    both IPV4 and IPV6.
+
+  async = (key, limit)
+
+    When this option is set, the urlgrab() is not processed immediately
+    but queued. parallel_wait() then processes grabs in parallel, limiting
+    the number of connections in each 'key' group to at most 'limit'.
+
+  max_connections
+
+    The global connection limit.
+
+  timedhosts
+
+    The filename of the host download statistics. If defined, urlgrabber
+    will update the stats at the end of every download. At the end of
+    parallel_wait(), the updated stats are saved. If synchronous grabs
+    are used, you should call th_save().
+
+  default_speed, half_life
+
+    These options only affect the async mirror selection code.
+    The default_speed option sets the speed estimate for mirrors
+    we have never downloaded from, and defaults to 1 MBps.
+
+    The speed estimate also drifts exponentially from the speed
+    actually measured to the default speed, with a default
+    period of 30 days.
+

 RETRY RELATED ARGUMENTS

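A hedged sketch of the queued API these options describe; parallel_wait() is referenced by the docs above but defined in a part of this patch past the end of this excerpt:

    from urlgrabber.grabber import default_grabber, parallel_wait

    for url in ('http://mirror.example/a.rpm', 'http://mirror.example/b.rpm'):
        # async=(key, limit): at most 4 concurrent connections in this group
        default_grabber.urlgrab(url, async=('mirror.example', 4))
    parallel_wait()   # drains the queue, downloading in parallel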
@@ -328,6 +385,15 @@ RETRY RELATED ARGUMENTS
     but it cannot (without severe trickiness) prevent the exception
     from being raised.

+  failfunc = None
+
+    The callback that gets called when a urlgrab request fails.
+    If defined, urlgrab() calls it instead of raising URLGrabError.
+    Callback syntax is identical to failure_callback.
+
+    Contrary to failure_callback, it's called only once. Its primary
+    purpose is to use urlgrab() without a try/except block.
+
   interrupt_callback = None

     This callback is called if KeyboardInterrupt is received at any
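A sketch of failfunc in use, based on the urlgrab() change below: on failure the options object is passed to the callback with .url and .exception set, and urlgrab() returns whatever the callback returns instead of raising:

    from urlgrabber.grabber import default_grabber

    def on_fail(obj):
        print 'download of %s failed: %s' % (obj.url, obj.exception)

    # no try/except needed; returns on_fail's result (None) on failure
    default_grabber.urlgrab('http://example.com/missing', '/tmp/missing',
                            failfunc=on_fail)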
@@ -420,6 +486,7 @@ import time
 import string
 import urllib
 import urllib2
+from httplib import responses
 import mimetools
 import thread
 import types
@@ -428,9 +495,17 @@ import pycurl
 from ftplib import parse150
 from StringIO import StringIO
 from httplib import HTTPException
-import socket
+import socket, select, fcntl
 from byterange import range_tuple_normalize, range_tuple_to_header, RangeError

+try:
+    import xattr
+    if not hasattr(xattr, 'set'):
+        xattr = None # This is a "newer" API.
+except ImportError:
+    xattr = None
+
+
 ########################################################################
 #                     MODULE INITIALIZATION
 ########################################################################
@@ -439,6 +514,12 @@ try:
 except:
     __version__ = '???'

+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
         else: handler = logging.FileHandler(filename)
         handler.setFormatter(formatter)
         DBOBJ = logging.getLogger('urlgrabber')
+        DBOBJ.propagate = False
         DBOBJ.addHandler(handler)
         DBOBJ.setLevel(level)
     except (KeyError, ImportError, ValueError):
@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):

 def _log_package_state():
     if not DEBUG: return
-    DEBUG.info('urlgrabber version  = %s' % __version__)
-    DEBUG.info('trans function "_"  = %s' % _)
+    DEBUG.debug('urlgrabber version  = %s' % __version__)
+    DEBUG.debug('trans function "_"  = %s' % _)

 _init_default_logger()
 _log_package_state()
@@ -527,6 +609,29 @@ def _(st):
 #                 END MODULE INITIALIZATION
 ########################################################################

+########################################################################
+#                 UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+    '''convert 'unicode' to an encoded utf-8 byte string '''
+    # stolen from yum.i18n
+    if isinstance(obj, unicode):
+        obj = obj.encode('utf-8', errors)
+    return obj
+
+def exception2msg(e):
+    try:
+        return str(e)
+    except UnicodeEncodeError:
+        # always use byte strings
+        return unicode(e).encode('utf8')
+
+########################################################################
+#                 END UTILITY FUNCTIONS
+########################################################################


 class URLGrabError(IOError):
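A quick illustration of the two helpers on Python 2 data; exception2msg() exists precisely for exceptions whose message is non-ASCII unicode, where plain str() raises UnicodeEncodeError:

    from urlgrabber.grabber import _to_utf8, exception2msg

    print _to_utf8(u'\u00e9l\u00e9ment')   # UTF-8 byte string
    print _to_utf8('already-bytes')        # non-unicode passes through

    try:
        raise ValueError(u'fran\u00e7ais')
    except ValueError, e:
        print exception2msg(e)             # safe where str(e) would raise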
@@ -662,6 +767,7 @@ class URLParser:
         opts.quote = 0     --> do not quote it
         opts.quote = None  --> guess
         """
+        url = _to_utf8(url)
         quote = opts.quote

         if opts.prefix:
@@ -768,6 +874,41 @@ class URLGrabberOptions:
         else: # throttle is a float
             return self.bandwidth * self.throttle

+    def find_proxy(self, url, scheme):
+        """Find the proxy to use for this URL.
+        Use the proxies dictionary first, then libproxy.
+        """
+        self.proxy = None
+        if scheme not in ('ftp', 'http', 'https'):
+            return
+
+        if self.proxies:
+            proxy = self.proxies.get(scheme)
+            if proxy is None:
+                if scheme == 'http':
+                    proxy = self.proxies.get('https')
+                elif scheme == 'https':
+                    proxy = self.proxies.get('http')
+            if proxy == '_none_':
+                proxy = ''
+            self.proxy = proxy
+            return
+
+        if self.libproxy:
+            global _libproxy_cache
+            if _libproxy_cache is None:
+                try:
+                    import libproxy
+                    _libproxy_cache = libproxy.ProxyFactory()
+                except:
+                    _libproxy_cache = False
+            if _libproxy_cache:
+                for proxy in _libproxy_cache.getProxies(url):
+                    if proxy.startswith('http://'):
+                        if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+                        self.proxy = proxy
+                        break
+
     def derive(self, **kwargs):
         """Create a derived URLGrabberOptions instance.
         This method creates a new instance and overrides the
@@ -791,30 +932,37 @@ class URLGrabberOptions:
         provided here.
         """
         self.progress_obj = None
+        self.multi_progress_obj = None
         self.throttle = 1.0
         self.bandwidth = 0
         self.retry = None
         self.retrycodes = [-1,2,4,5,6,7]
         self.checkfunc = None
+        self.failfunc = _do_raise
         self.copy_local = 0
         self.close_connection = 0
         self.range = None
         self.user_agent = 'urlgrabber/%s' % __version__
+        self.ip_resolve = None
         self.keepalive = 1
         self.proxies = None
+        self.libproxy = False
+        self.proxy = None
         self.reget = None
         self.failure_callback = None
         self.interrupt_callback = None
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
+        self.username = None
+        self.password = None
         self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
         self.ssl_context = None # no-op in pycurl
         self.ssl_verify_peer = True # check peer's cert for authenticity
@@ -827,6 +975,12 @@ class URLGrabberOptions:
         self.size = None # if we know how big the thing we're getting is going
                          # to be. this is ultimately a MAXIMUM size for the file
         self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
+        self.async = None # blocking by default
+        self.mirror_group = None
+        self.max_connections = 5
+        self.timedhosts = None
+        self.half_life = 30*24*60*60 # 30 days
+        self.default_speed = 1e6 # 1 MBit

     def __repr__(self):
         return self.format()
@@ -846,7 +1000,18 @@ class URLGrabberOptions:
         s = s + indent + '}'
         return s

-class URLGrabber:
+def _do_raise(obj):
+    raise obj.exception
+
+def _run_callback(cb, obj):
+    if not cb:
+        return
+    if callable(cb):
+        return cb(obj)
+    cb, arg, karg = cb
+    return cb(obj, *arg, **karg)
+
+class URLGrabber(object):
     """Provides easy opening of URLs with a variety of options.

     All options are specified as kwargs. Options may be specified when
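_run_callback() accepts the two callback shapes used throughout urlgrabber -- a bare callable, or a (func, args, kwargs) triple whose extra arguments are appended after the callback object:

    from urlgrabber.grabber import _run_callback

    def cb(obj, who='world'):
        print 'callback got %r, hello %s' % (obj, who)

    _run_callback(cb, 'OBJ')                                # bare callable
    _run_callback((cb, (), {'who': 'urlgrabber'}), 'OBJ')   # tuple form
    _run_callback(None, 'OBJ')                              # no-op when unset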
@@ -872,7 +1037,6 @@ class URLGrabber:
             # beware of infinite loops :)
             tries = tries + 1
             exception = None
-            retrycode = None
             callback  = None
             if DEBUG: DEBUG.info('attempt %i/%s: %s',
                                  tries, opts.retry, args[0])
@@ -883,54 +1047,62 @@ class URLGrabber:
             except URLGrabError, e:
                 exception = e
                 callback = opts.failure_callback
-                retrycode = e.errno
             except KeyboardInterrupt, e:
                 exception = e
                 callback = opts.interrupt_callback
+                if not callback:
+                    raise

             if DEBUG: DEBUG.info('exception: %s', exception)
             if callback:
                 if DEBUG: DEBUG.info('calling callback: %s', callback)
-                cb_func, cb_args, cb_kwargs = self._make_callback(callback)
                 obj = CallbackObject(exception=exception, url=args[0],
                                      tries=tries, retry=opts.retry)
-                cb_func(obj, *cb_args, **cb_kwargs)
+                _run_callback(callback, obj)

             if (opts.retry is None) or (tries == opts.retry):
                 if DEBUG: DEBUG.info('retries exceeded, re-raising')
                 raise

+            retrycode = getattr(exception, 'errno', None)
             if (retrycode is not None) and (retrycode not in opts.retrycodes):
                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                      retrycode, opts.retrycodes)
                 raise

-    def urlopen(self, url, **kwargs):
+    def urlopen(self, url, opts=None, **kwargs):
         """open the url and return a file object
         If a progress object or throttle value specified when this
         object was created, then a special file object will be
         returned that supports them. The file object can be treated
         like any other file object.
         """
-        opts = self.opts.derive(**kwargs)
+        url = _to_utf8(url)
+        opts = (opts or self.opts).derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts)
+        opts.find_proxy(url, parts[0])
         def retryfunc(opts, url):
             return PyCurlFileObject(url, filename=None, opts=opts)
         return self._retry(opts, retryfunc, url)

-    def urlgrab(self, url, filename=None, **kwargs):
+    def urlgrab(self, url, filename=None, opts=None, **kwargs):
         """grab the file at <url> and make a local copy at <filename>
         If filename is none, the basename of the url is used.
         urlgrab returns the filename of the local file, which may be
         different from the passed-in filename if copy_local == 0.
         """
-        opts = self.opts.derive(**kwargs)
+        url = _to_utf8(url)
+        opts = (opts or self.opts).derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts)
         (scheme, host, path, parm, query, frag) = parts
+        opts.find_proxy(url, scheme)
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
+            if not filename:
+                # This is better than nothing.
+                filename = 'index.html'
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a
             # copy currently
@@ -950,41 +1122,51 @@ class URLGrabber:

         elif not opts.range:
             if not opts.checkfunc is None:
-                cb_func, cb_args, cb_kwargs = \
-                         self._make_callback(opts.checkfunc)
-                obj = CallbackObject()
-                obj.filename = path
-                obj.url = url
-                apply(cb_func, (obj, )+cb_args, cb_kwargs)
+                obj = CallbackObject(filename=path, url=url)
+                _run_callback(opts.checkfunc, obj)
             return path

+        if opts.async:
+            opts.url = url
+            opts.filename = filename
+            opts.size = int(opts.size or 0)
+            _async_queue.append(opts)
+            return filename
+
         def retryfunc(opts, url, filename):
             fo = PyCurlFileObject(url, filename, opts)
             try:
                 fo._do_grab()
+                if fo._tm_last:
+                    dlsz = fo._tm_last[0] - fo._tm_first[0]
+                    dltm = fo._tm_last[1] - fo._tm_first[1]
+                    _TH.update(url, dlsz, dltm, None)
                 if not opts.checkfunc is None:
-                    cb_func, cb_args, cb_kwargs = \
-                             self._make_callback(opts.checkfunc)
-                    obj = CallbackObject()
-                    obj.filename = filename
-                    obj.url = url
-                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
+                    obj = CallbackObject(filename=filename, url=url)
+                    _run_callback(opts.checkfunc, obj)
             finally:
                 fo.close()
             return filename

-        return self._retry(opts, retryfunc, url, filename)
+        try:
+            return self._retry(opts, retryfunc, url, filename)
+        except URLGrabError, e:
+            _TH.update(url, 0, 0, e)
+            opts.exception = e
+            return _run_callback(opts.failfunc, opts)

-    def urlread(self, url, limit=None, **kwargs):
+    def urlread(self, url, limit=None, opts=None, **kwargs):
         """read the url into a string, up to 'limit' bytes
         If the limit is exceeded, an exception will be thrown. Note
         that urlread is NOT intended to be used as a way of saying
         "I want the first N bytes" but rather 'read the whole file
         into memory, but don't use too much'
         """
-        opts = self.opts.derive(**kwargs)
+        url = _to_utf8(url)
+        opts = (opts or self.opts).derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts)
+        opts.find_proxy(url, parts[0])
         if limit is not None:
             limit = limit + 1

@@ -1000,12 +1182,8 @@ class URLGrabber:
             else: s = fo.read(limit)

             if not opts.checkfunc is None:
-                cb_func, cb_args, cb_kwargs = \
-                         self._make_callback(opts.checkfunc)
-                obj = CallbackObject()
-                obj.data = s
-                obj.url = url
-                apply(cb_func, (obj, )+cb_args, cb_kwargs)
+                obj = CallbackObject(data=s, url=url)
+                _run_callback(opts.checkfunc, obj)
         finally:
             fo.close()
         return s
@@ -1020,6 +1198,7 @@ class URLGrabber:
         return s

     def _make_callback(self, callback_obj):
+        # not used, left for compatibility
         if callable(callback_obj):
             return callback_obj, (), {}
         else:
@@ -1030,7 +1209,7 @@ class URLGrabber:
 default_grabber = URLGrabber()


-class PyCurlFileObject():
+class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
         self._hdr_dump = ''
@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
+        self._tm_first = None
+        self._tm_last = None
         self._do_open()

-
+
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
         Any attribute not found in _this_ object will be searched for
@@ -1067,6 +1249,12 @@ class PyCurlFileObject():

     def _retrieve(self, buf):
         try:
+            tm = self._amount_read + len(buf), time.time()
+            if self._tm_first is None:
+                self._tm_first = tm
+            else:
+                self._tm_last = tm
+
             if not self._prog_running:
                 if self.opts.progress_obj:
                     size = self.size + self._reget_length
@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
                     self.opts.progress_obj.update(self._amount_read)

             self._amount_read += len(buf)
-            self.fo.write(buf)
+            try:
+                self.fo.write(buf)
+            except IOError, e:
+                self._cb_error = URLGrabError(16, exception2msg(e))
+                return -1
             return len(buf)
         except KeyboardInterrupt:
             return -1

     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump),
                                max_size=self.opts.max_header_size):
-            return -1
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
-
+
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.debug('header ended:')
+
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr = mimetools.Message(hdrfp)
         return self._parsed_hdr

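Why both added lines matter: mimetools.Message parses from the file's current position, and the slice must start after the status line's newline, not on it. A standalone demonstration:

    import mimetools
    from StringIO import StringIO

    dump = 'HTTP/1.1 200 OK\r\nContent-Length: 42\r\n\r\n'
    statusend = dump.find('\n') + 1   # skip the status line and its '\n'
    hdrfp = StringIO()
    hdrfp.write(dump[statusend:])
    hdrfp.seek(0)                     # rewind, or Message() reads nothing
    print mimetools.Message(hdrfp).getheader('content-length')   # '42'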
@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
         if not opts:
             opts = self.opts

+        # keepalives
+        if not opts.keepalive:
+            self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)

         # defaults we're always going to set
         self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)

-        if DEBUG:
+        if DEBUG and DEBUG.level <= 10:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
         if opts.user_agent:
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+        if opts.ip_resolve:
+            # Default is: IPRESOLVE_WHATEVER
+            ipr = opts.ip_resolve.lower()
+            if ipr == 'whatever': # Do we need this?
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_WHATEVER)
+            if ipr == 'ipv4':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+            if ipr == 'ipv6':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)

         # maybe to be options later
         self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1370,11 @@ class PyCurlFileObject():

         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)

         # ssl options
         if self.scheme == 'https':
@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
                 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
             self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
-            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+            if opts.ssl_verify_host: # 1 is meaningless to curl
+                self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
             if opts.ssl_key:
                 self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
             if opts.ssl_key_type:
                 self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
             if opts.ssl_cert:
                 self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+                # if we have a client side cert - turn off reuse b/c nss is odd
+                self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
             if opts.ssl_cert_type:
                 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
             if opts.ssl_key_pass:
@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
         if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
             self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))

-        # proxy settings
-        if opts.proxies:
-            for (scheme, proxy) in opts.proxies.items():
-                if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
-                    if scheme not in ('ftp'):
-                        continue
-                    else:
-                        if proxy == '_none_': proxy = ""
-                        self.curl_obj.setopt(pycurl.PROXY, proxy)
-                elif self.scheme in ('http', 'https'):
-                    if scheme not in ('http', 'https'):
-                        continue
-                    else:
-                        if proxy == '_none_': proxy = ""
-                        self.curl_obj.setopt(pycurl.PROXY, proxy)
-
-        # FIXME username/password/auth settings
+        # proxy
+        if opts.proxy is not None:
+            self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+            self.curl_obj.setopt(pycurl.PROXYAUTH,
+                # All but Kerberos.  BZ 769254
+                pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)
+
+        if opts.username and opts.password:
+            if self.scheme in ('http', 'https'):
+                self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+
+            if opts.username and opts.password:
+                # apparently when applying them as curlopts they do not require quoting of any kind
+                userpwd = '%s:%s' % (opts.username, opts.password)
+                self.curl_obj.setopt(pycurl.USERPWD, userpwd)

         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+            self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))

         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,39 +1453,36 @@ class PyCurlFileObject():

         code = self.http_code
         errcode = e.args[0]
+        errurl = urllib.unquote(self.url)
+
         if self._error[0]:
             errcode = self._error[0]

-        if errcode == 23 and code >= 200 and code < 299:
-            err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-            err.url = self.url
-
+        if errcode == 23 and 200 <= code <= 299:
             # this is probably wrong but ultimately this is what happens
             # we have a legit http code and a pycurl 'writer failed' code
             # which almost always means something aborted it from outside
             # since we cannot know what it is -I'm banking on it being
             # a ctrl-c. XXXX - if there's a way of going back two raises to
             # figure out what aborted the pycurl process FIXME
-            raise KeyboardInterrupt
+            raise getattr(self, '_cb_error', KeyboardInterrupt)

         elif errcode == 28:
-            err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
-            err.url = self.url
+            err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
+            err.url = errurl
             raise err
         elif errcode == 35:
             msg = _("problem making ssl connection")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
         elif errcode == 37:
-            msg = _("Could not open/read %s") % (self.url)
+            msg = _("Could not open/read %s") % (errurl)
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err

         elif errcode == 42:
-            err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-            err.url = self.url
             # this is probably wrong but ultimately this is what happens
             # we have a legit http code and a pycurl 'writer failed' code
             # which almost always means something aborted it from outside
@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
         elif errcode == 58:
             msg = _("problem with the local client certificate")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err

         elif errcode == 60:
-            msg = _("client cert cannot be verified or client cert incorrect")
+            msg = _("Peer cert cannot be verified or peer cert invalid")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err

         elif errcode == 63:
             if self._error[1]:
                 msg = self._error[1]
             else:
-                msg = _("Max download size exceeded on %s") % (self.url)
+                msg = _("Max download size exceeded on %s") % (errurl)
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err

-        elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-            msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+        elif str(e.args[1]) == '' and code and not 200 <= code <= 299:
+            if self.scheme in ['http', 'https']:
+                if self.http_code in responses:
+                    resp = responses[self.http_code]
+                    msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+                else:
+                    msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+            elif self.scheme in ['ftp']:
+                msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+            else:
+                msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
         else:
-            msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+            pyerr2str = { 5 : _("Couldn't resolve proxy"),
+                          6 : _("Couldn't resolve host"),
+                          7 : _("Couldn't connect"),
+                          8 : _("Bad reply to FTP server"),
+                          9 : _("Access denied"),
+                         11 : _("Bad reply to FTP pass"),
+                         13 : _("Bad reply to FTP pasv"),
+                         14 : _("Bad reply to FTP 227"),
+                         15 : _("Couldn't get FTP host"),
+                         17 : _("Couldn't set FTP type"),
+                         18 : _("Partial file"),
+                         19 : _("FTP RETR command failed"),
+                         22 : _("HTTP returned error"),
+                         23 : _("Write error"),
+                         25 : _("Upload failed"),
+                         26 : _("Read error"),
+                         27 : _("Out of Memory"),
+                         28 : _("Operation timed out"),
+                         30 : _("FTP PORT command failed"),
+                         31 : _("FTP REST command failed"),
+                         33 : _("Range failed"),
+                         34 : _("HTTP POST failed"),
+                         35 : _("SSL CONNECT failed"),
+                         36 : _("Couldn't resume download"),
+                         37 : _("Couldn't read file"),
+                         42 : _("Aborted by callback"),
+                         47 : _("Too many redirects"),
+                         51 : _("Peer certificate failed verification"),
+                         52 : _("Got nothing: SSL certificate expired?"),
+                         53 : _("SSL engine not found"),
+                         54 : _("SSL engine set failed"),
+                         55 : _("Network error send()"),
+                         56 : _("Network error recv()"),
+                         58 : _("Local certificate failed"),
+                         59 : _("SSL set cipher failed"),
+                         60 : _("Local CA certificate failed"),
+                         61 : _("HTTP bad transfer encoding"),
+                         63 : _("Maximum file size exceeded"),
+                         64 : _("FTP SSL failed"),
+                         67 : _("Authentication failure"),
+                         70 : _("Out of disk space on server"),
+                         73 : _("Remote file exists"),
+                        }
+            errstr = str(e.args[1])
+            if not errstr:
+                errstr = pyerr2str.get(errcode, '<Unknown>')
+            msg = 'curl#%s - "%s"' % (errcode, errstr)
             code = errcode
         err = URLGrabError(14, msg)
         err.code = code
         err.exception = e
         raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = urllib.unquote(self.url)
+                raise err

     def _do_open(self):
         self.curl_obj = _curl_cache
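What the remapped errors look like to a caller: URLGrabError 14 now carries the curl error code in err.code, the original pycurl exception in err.exception, and a 'curl#N - "..."' message built from the table above. A sketch:

    from urlgrabber.grabber import URLGrabError, urlgrab

    try:
        urlgrab('http://no-such-host.invalid/x', '/tmp/x')
    except URLGrabError, e:
        print e.errno                   # 14 (unrecoverable grab error)
        print getattr(e, 'code', None)  # e.g. 6 (couldn't resolve host)
        print e.strerror                # e.g. curl#6 - "Couldn't resolve host"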
@@ -1333,7 +1616,11 @@ class PyCurlFileObject():

         if self.opts.range:
             rt = self.opts.range
-            if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+            if rt[0] is None:
+                rt = (0, rt[1])
+            rt = (rt[0] + reget_length, rt[1])
+

         if rt:
             header = range_tuple_to_header(rt)
@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
             #fh, self._temp_name = mkstemp()
             #self.fo = open(self._temp_name, 'wb')

-
-        self._do_perform()
-
-
-
+        try:
+            self._do_perform()
+        except URLGrabError, e:
+            self.fo.flush()
+            self.fo.close()
+            raise e
+
         if _was_filename:
             # close it up
             self.fo.flush()
             self.fo.close()
+
+            # Set the URL where we got it from:
+            if xattr is not None:
+                # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+                try:
+                    xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+                except:
+                    pass # URL too long. = IOError ... ignore everything.
+
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _(\
+                      'error setting timestamp on file %s from %s, OSError: %s')
+                              % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
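The origin URL can be read back with the matching getter; a hedged sketch assuming the same old pyxattr API the import guard above tests for (a module providing xattr.set should also provide xattr.get):

    import xattr
    # works only on filesystems with extended attributes enabled
    print xattr.get('/tmp/big.iso', 'user.xdg.origin.url')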
@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
             if self._prog_running:
                 downloaded += self._reget_length
                 self.opts.progress_obj.update(downloaded)
-        except KeyboardInterrupt:
+        except (KeyboardInterrupt, IOError):
             return -1

     def _over_max_size(self, cur, max_size=None):

         if not max_size:
-            max_size = self.size
-        if self.opts.size:  # if we set an opts size use that, no matter what
-            max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
+
+        if cur > int(float(max_size) * 1.10):

             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
             return True
         return False

-    def _to_utf8(self, obj, errors='replace'):
-        '''convert 'unicode' to an encoded utf-8 byte string '''
-        # stolen from yum.i18n
-        if isinstance(obj, unicode):
-            obj = obj.encode('utf-8', errors)
-        return obj
-
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()

-
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
+
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over

+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+_libproxy_cache = None
+

 #####################################################################
 # DEPRECATED FUNCTIONS
@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,


 #####################################################################
+#  Serializer + parser: A replacement of the rather bulky Json code.
+#
+# - handles basic python literals, lists and tuples.
+# - serialized strings never contain ' ' or '\n'
+#
+#####################################################################

+_quoter_map = {}
+for c in '%[(,)] \n':
+    _quoter_map[c] = '%%%02x' % ord(c)
+del c
+
+def _dumps(v):
+    if v is None: return 'None'
+    if v is True: return 'True'
+    if v is False: return 'False'
+    if type(v) in (int, long, float):
+        return str(v)
+    if type(v) == unicode:
+        v = v.encode('UTF8')
+    if type(v) == str:
+        def quoter(c): return _quoter_map.get(c, c)
+        return "'%s'" % ''.join(map(quoter, v))
+    if type(v) == tuple:
+        return "(%s)" % ','.join(map(_dumps, v))
+    if type(v) == list:
+        return "[%s]" % ','.join(map(_dumps, v))
+    raise TypeError, 'Can\'t serialize %s' % v
+
+def _loads(s):
+    def decode(v):
+        if v == 'None': return None
+        if v == 'True': return True
+        if v == 'False': return False
+        try: return int(v)
+        except ValueError: pass
+        try: return float(v)
+        except ValueError: pass
+        if len(v) >= 2 and v[0] == v[-1] == "'":
+            ret = []; i = 1
+            while True:
+                j = v.find('%', i)
+                ret.append(v[i:j]) # skips the final "'"
+                if j == -1: break
+                ret.append(chr(int(v[j + 1:j + 3], 16)))
+                i = j + 3
+            v = ''.join(ret)
+        return v
+    stk = None
+    l = []
+    i = j = 0
+    while True:
+        if j == len(s) or s[j] in ',)]':
+            if j > i:
+                l.append(decode(s[i:j]))
+            if j == len(s): break
+            if s[j] in ')]':
+                if s[j] == ')':
+                    l = tuple(l)
+                stk[0].append(l)
+                l, stk = stk
+            i = j = j + 1
+        elif s[j] in '[(':
+            stk = l, stk
+            l = []
+            i = j = j + 1
+        else:
+            j += 1 # safe because '[(,)]' are quoted
+    if stk: raise ValueError
+    if len(l) == 1: l = l[0]
+    return l
+
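A round-trip illustration: values survive _loads(_dumps(v)), and the serialized form contains no spaces or newlines, which is what lets the downloader pack many key=value pairs onto one space-separated request line:

    from urlgrabber.grabber import _dumps, _loads

    v = ('http://h/a b', [1, 2.5, None])
    print _dumps(v)           # ('http://h/a%20b',[1,2.5,None])
    print _loads(_dumps(v))   # ('http://h/a b', [1, 2.5, None])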
1364 | + | |
1365 | +##################################################################### | |
1366 | +# External downloader process | |
1367 | +##################################################################### | |
1368 | + | |
1369 | +def _readlines(fd): | |
1370 | + buf = os.read(fd, 4096) | |
1371 | + if not buf: return None | |
1372 | + # whole lines only, no buffering | |
1373 | + while buf[-1] != '\n': | |
1374 | + buf += os.read(fd, 4096) | |
1375 | + return buf[:-1].split('\n') | |
1376 | + | |
1377 | +import subprocess | |
1378 | + | |
1379 | +class _ExternalDownloader: | |
1380 | + def __init__(self): | |
1381 | + self.popen = subprocess.Popen( | |
1382 | + '/usr/libexec/urlgrabber-ext-down', | |
1383 | + stdin = subprocess.PIPE, | |
1384 | + stdout = subprocess.PIPE, | |
1385 | + ) | |
1386 | + self.stdin = self.popen.stdin.fileno() | |
1387 | + self.stdout = self.popen.stdout.fileno() | |
1388 | + self.running = {} | |
1389 | + self.cnt = 0 | |
1390 | + | |
1391 | + # list of options we pass to downloader | |
1392 | + _options = ( | |
1393 | + 'url', 'filename', | |
1394 | + 'timeout', 'close_connection', 'keepalive', | |
1395 | + 'throttle', 'bandwidth', 'range', 'reget', | |
1396 | + 'user_agent', 'http_headers', 'ftp_headers', | |
1397 | + 'proxy', 'prefix', 'username', 'password', | |
1398 | + 'ssl_ca_cert', | |
1399 | + 'ssl_cert', 'ssl_cert_type', | |
1400 | + 'ssl_key', 'ssl_key_type', | |
1401 | + 'ssl_key_pass', | |
1402 | + 'ssl_verify_peer', 'ssl_verify_host', | |
1403 | + 'size', 'max_header_size', 'ip_resolve', | |
1404 | + ) | |
1405 | + | |
1406 | + def start(self, opts): | |
1407 | + arg = [] | |
1408 | + for k in self._options: | |
1409 | + v = getattr(opts, k) | |
1410 | + if v is None: continue | |
1411 | + arg.append('%s=%s' % (k, _dumps(v))) | |
1412 | + if opts.progress_obj and opts.multi_progress_obj: | |
1413 | + arg.append('progress_obj=True') | |
1414 | + arg = ' '.join(arg) | |
1415 | + if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url) | |
1416 | + | |
1417 | + self.cnt += 1 | |
1418 | + self.running[self.cnt] = opts | |
1419 | + os.write(self.stdin, arg + '\n') | |
1420 | + | |
1421 | + def perform(self): | |
1422 | + ret = [] | |
1423 | + lines = _readlines(self.stdout) | |
1424 | + if not lines: | |
1425 | + if DEBUG: DEBUG.info('downloader died') | |
1426 | + raise KeyboardInterrupt | |
1427 | + for line in lines: | |
1428 | + # parse downloader output | |
1429 | + line = line.split(' ', 5) | |
1430 | + _id, size = map(int, line[:2]) | |
1431 | + if len(line) == 2: | |
1432 | + self.running[_id]._progress.update(size) | |
1433 | + continue | |
1434 | + # job done | |
1435 | + opts = self.running.pop(_id) | |
1436 | + if line[4] == 'OK': | |
1437 | + ug_err = None | |
1438 | + if DEBUG: DEBUG.info('success') | |
1439 | + else: | |
1440 | + ug_err = URLGrabError(int(line[4]), line[5]) | |
1441 | + if DEBUG: DEBUG.info('failure: %s', ug_err) | |
1442 | + _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async[0]) | |
1443 | + ret.append((opts, size, ug_err)) | |
1444 | + return ret | |
1445 | + | |
1446 | + def abort(self): | |
1447 | + self.popen.stdin.close() | |
1448 | + self.popen.stdout.close() | |
1449 | + self.popen.wait() | |
1450 | + | |
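Reading start() and perform() together gives the wire protocol spoken with /usr/libexec/urlgrabber-ext-down; the field layout below is inferred from the parsing code above, not from separate documentation:

    # request, one line per job, written to the downloader's stdin:
    #   url='http://h/f' filename='f' timeout=300.0 ...   (key=_dumps(value) pairs)
    # responses, one line each, read back from its stdout:
    #   <id> <bytes_read>                                 progress update
    #   <id> <size> <dl_size> <dl_time> OK                success
    #   <id> <size> <dl_size> <dl_time> <errno> <errmsg>  failure (URLGrabError args)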
1451 | +class _ExternalDownloaderPool: | |
1452 | + def __init__(self): | |
1453 | + self.epoll = select.epoll() | |
1454 | + self.running = {} | |
1455 | + self.cache = {} | |
1456 | + | |
1457 | + def start(self, opts): | |
1458 | + host = urlparse.urlsplit(opts.url).netloc | |
1459 | + dl = self.cache.pop(host, None) | |
1460 | + if not dl: | |
1461 | + dl = _ExternalDownloader() | |
1462 | + fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD) | |
1463 | + fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC) | |
1464 | + self.epoll.register(dl.stdout, select.EPOLLIN) | |
1465 | + self.running[dl.stdout] = dl | |
1466 | + dl.start(opts) | |
1467 | + | |
1468 | + def perform(self): | |
1469 | + ret = [] | |
1470 | + for fd, event in self.epoll.poll(): | |
1471 | + if event & select.EPOLLHUP: | |
1472 | + if DEBUG: DEBUG.info('downloader died') | |
1473 | + raise KeyboardInterrupt | |
1474 | + assert event & select.EPOLLIN | |
1475 | + done = self.running[fd].perform() | |
1476 | + if not done: continue | |
1477 | + assert len(done) == 1 | |
1478 | + ret.extend(done) | |
1479 | + | |
1480 | + # dl finished, move it to the cache | |
1481 | + host = urlparse.urlsplit(done[0][0].url).netloc | |
1482 | + if host in self.cache: self.cache[host].abort() | |
1483 | + self.epoll.unregister(fd) | |
1484 | + self.cache[host] = self.running.pop(fd) | |
1485 | + return ret | |
1486 | + | |
1487 | + def abort(self): | |
1488 | + for dl in self.running.values(): | |
1489 | + self.epoll.unregister(dl.stdout) | |
1490 | + dl.abort() | |
1491 | + for dl in self.cache.values(): | |
1492 | + dl.abort() | |
1493 | + | |
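A minimal driver sketch for the pool, with hypothetical names (the real loop lives in parallel_wait() below); one downloader process is kept per host and reused via the cache:

    pool = _ExternalDownloaderPool()
    pool.start(opts)                        # opts.url picks or reuses the per-host process
    while pool.running:
        for opts, size, ug_err in pool.perform():
            print opts.url, size, ug_err    # hypothetical result handling
    pool.abort()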
1494 | + | |
1495 | +##################################################################### | |
1496 | +# High level async API | |
1497 | +##################################################################### | |
1498 | + | |
1499 | +_async_queue = [] | |
1500 | + | |
1501 | +def parallel_wait(meter=None): | |
1502 | + '''Process queued requests in parallel. | |
1503 | + ''' | |
1504 | + | |
1505 | + # calculate total sizes | |
1506 | + meters = {} | |
1507 | + for opts in _async_queue: | |
1508 | + if opts.progress_obj and opts.multi_progress_obj: | |
1509 | + count, total = meters.get(opts.multi_progress_obj) or (0, 0) | |
1510 | + meters[opts.multi_progress_obj] = count + 1, total + opts.size | |
1511 | + | |
1512 | + # start multi-file meters | |
1513 | + for meter in meters: | |
1514 | + count, total = meters[meter] | |
1515 | + meter.start(count, total) | |
1516 | + | |
1517 | + dl = _ExternalDownloaderPool() | |
1518 | + host_con = {} # current host connection counts | |
1519 | + | |
1520 | + def start(opts, tries): | |
1521 | + opts.tries = tries | |
1522 | + try: | |
1523 | + dl.start(opts) | |
1524 | + except OSError, e: | |
1525 | + # can't spawn downloader, give up immediately | |
1526 | + opts.exception = URLGrabError(5, exception2msg(e)) | |
1527 | + _run_callback(opts.failfunc, opts) | |
1528 | + return | |
1529 | + | |
1530 | + key, limit = opts.async | |
1531 | + host_con[key] = host_con.get(key, 0) + 1 | |
1532 | + if opts.progress_obj: | |
1533 | + if opts.multi_progress_obj: | |
1534 | + opts._progress = opts.multi_progress_obj.newMeter() | |
1535 | + opts._progress.start(text=opts.text) | |
1536 | + else: | |
1537 | + opts._progress = time.time() # no updates | |
1538 | + | |
1539 | + def perform(): | |
1540 | + for opts, size, ug_err in dl.perform(): | |
1541 | + key, limit = opts.async | |
1542 | + host_con[key] -= 1 | |
1543 | + | |
1544 | + if ug_err is None: | |
1545 | + if opts.checkfunc: | |
1546 | + try: _run_callback(opts.checkfunc, opts) | |
1547 | + except URLGrabError, ug_err: pass | |
1548 | + | |
1549 | + if opts.progress_obj: | |
1550 | + if opts.multi_progress_obj: | |
1551 | + if ug_err: | |
1552 | + opts._progress.failure(None) | |
1553 | + else: | |
1554 | + opts.multi_progress_obj.re.total += size - opts.size # correct totals | |
1555 | + opts._progress.end(size) | |
1556 | + opts.multi_progress_obj.removeMeter(opts._progress) | |
1557 | + else: | |
1558 | + opts.progress_obj.start(text=opts.text, now=opts._progress) | |
1559 | + opts.progress_obj.update(size) | |
1560 | + opts.progress_obj.end(size) | |
1561 | + del opts._progress | |
1562 | + | |
1563 | + if ug_err is None: | |
1564 | + continue | |
1565 | + | |
1566 | + retry = opts.retry or 0 | |
1567 | + if opts.failure_callback: | |
1568 | + opts.exception = ug_err | |
1569 | + try: _run_callback(opts.failure_callback, opts) | |
1570 | + except URLGrabError, ug_err: | |
1571 | + retry = 0 # no retries | |
1572 | + if opts.tries < retry and ug_err.errno in opts.retrycodes: | |
1573 | + start(opts, opts.tries + 1) # simple retry | |
1574 | + continue | |
1575 | + | |
1576 | + if opts.mirror_group: | |
1577 | + mg, errors, failed, removed = opts.mirror_group | |
1578 | + errors.append((opts.url, exception2msg(ug_err))) | |
1579 | + failed[key] = failed.get(key, 0) + 1 | |
1580 | + opts.mirror = key | |
1581 | + opts.exception = ug_err | |
1582 | + action = mg.default_action or {} | |
1583 | + if mg.failure_callback: | |
1584 | + opts.tries = len(errors) | |
1585 | + action = dict(action) # update only the copy | |
1586 | + action.update(_run_callback(mg.failure_callback, opts)) | |
1587 | + if not action.get('fail', 0): | |
1588 | + # mask this mirror and retry | |
1589 | + if action.get('remove', 1): | |
1590 | + removed.add(key) | |
1591 | + _async_queue.append(opts) | |
1592 | + continue | |
1593 | + # fail=1 from callback | |
1594 | + ug_err.errors = errors | |
1595 | + | |
1596 | + # urlgrab failed | |
1597 | + opts.exception = ug_err | |
1598 | + _run_callback(opts.failfunc, opts) | |
1599 | + | |
1600 | + try: | |
1601 | + idx = 0 | |
1602 | + while True: | |
1603 | + if idx >= len(_async_queue): | |
1604 | + # the queue is empty | |
1605 | + if not dl.running: break | |
1606 | + # pending dl may extend it | |
1607 | + perform() | |
1608 | + continue | |
1609 | + | |
1610 | + # handle next request | |
1611 | + opts = _async_queue[idx] | |
1612 | + idx += 1 | |
1613 | + | |
1614 | + # check global limit | |
1615 | + while len(dl.running) >= default_grabber.opts.max_connections: | |
1616 | + perform() | |
1617 | + if DEBUG: | |
1618 | + DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections) | |
1619 | + | |
1620 | + if opts.mirror_group: | |
1621 | + mg, errors, failed, removed = opts.mirror_group | |
1622 | + | |
1623 | + # find the best mirror | |
1624 | + best = None | |
1625 | + best_speed = None | |
1626 | + for mirror in mg.mirrors: | |
1627 | + key = mirror['mirror'] | |
1628 | + if key in removed: continue | |
1629 | + | |
1630 | + # estimate mirror speed | |
1631 | + speed, fail = _TH.estimate(key) | |
1632 | + speed /= 1 + host_con.get(key, 0) | |
1633 | + | |
1634 | + # order by: least failures, private flag, best speed | |
1635 | + # ignore 'private' flag if there were failures | |
1636 | + private = not fail and mirror.get('kwargs', {}).get('private', False) | |
1637 | + speed = -failed.get(key, 0), private, speed | |
1638 | + if best is None or speed > best_speed: | |
1639 | + best = mirror | |
1640 | + best_speed = speed | |
1641 | + | |
1642 | + if best is None: | |
1643 | + opts.exception = URLGrabError(256, _('No more mirrors to try.')) | |
1644 | + opts.exception.errors = errors | |
1645 | + _run_callback(opts.failfunc, opts) | |
1646 | + continue | |
1647 | + | |
1648 | + # update the grabber object, apply mirror kwargs | |
1649 | + grabber = best.get('grabber') or mg.grabber | |
1650 | + opts.delegate = grabber.opts.derive(**best.get('kwargs', {})) | |
1651 | + | |
1652 | + # update the current mirror and limit | |
1653 | + key = best['mirror'] | |
1654 | + limit = best.get('kwargs', {}).get('max_connections', 2) | |
1655 | + opts.async = key, limit | |
1656 | + | |
1657 | + # update URL and proxy | |
1658 | + url = mg._join_url(key, opts.relative_url) | |
1659 | + url, parts = opts.urlparser.parse(url, opts) | |
1660 | + opts.find_proxy(url, parts[0]) | |
1661 | + opts.url = url | |
1662 | + | |
1663 | + # check host limit, then start | |
1664 | + key, limit = opts.async | |
1665 | + while host_con.get(key, 0) >= limit: | |
1666 | + perform() | |
1667 | + if DEBUG: | |
1668 | + DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit) | |
1669 | + | |
1670 | + start(opts, 1) | |
1671 | + except IOError, e: | |
1672 | + if e.errno != 4: raise # 4 == errno.EINTR (interrupted system call) | |
1673 | + raise KeyboardInterrupt | |
1674 | + | |
1675 | + finally: | |
1676 | + dl.abort() | |
1677 | + for meter in meters: | |
1678 | + meter.end() | |
1679 | + del _async_queue[:] | |
1680 | + _TH.save() | |
1681 | + | |
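From the caller's side, requests accumulate in _async_queue and one parallel_wait() call drains them. A hedged sketch, assuming the async branch added to URLGrabber.urlgrab elsewhere in this patch queues the request instead of downloading it; the async tuple is (host_key, max_connections), as unpacked above, and on_fail is a hypothetical callback:

    for url in urls:
        default_grabber.urlgrab(url, os.path.basename(url),
                                async=(url, 3), failfunc=on_fail)  # queued only
    parallel_wait()  # everything queued above downloads here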
1682 | + | |
1683 | +##################################################################### | |
1684 | +# Host bandwidth estimation | |
1685 | +##################################################################### | |
1686 | + | |
1687 | +class _TH: | |
1688 | + hosts = {} | |
1689 | + dirty = None | |
1690 | + | |
1691 | + @staticmethod | |
1692 | + def load(): | |
1693 | + filename = default_grabber.opts.timedhosts | |
1694 | + if filename and _TH.dirty is None: | |
1695 | + try: | |
1696 | + for line in open(filename): | |
1697 | + host, speed, fail, ts = line.rsplit(' ', 3) | |
1698 | + _TH.hosts[host] = int(speed), int(fail), int(ts) | |
1699 | + except IOError: pass | |
1700 | + _TH.dirty = False | |
1701 | + | |
1702 | + @staticmethod | |
1703 | + def save(): | |
1704 | + filename = default_grabber.opts.timedhosts | |
1705 | + if filename and _TH.dirty is True: | |
1706 | + tmp = '%s.%d' % (filename, os.getpid()) | |
1707 | + try: | |
1708 | + f = open(tmp, 'w') | |
1709 | + for host in _TH.hosts: | |
1710 | + f.write(host + ' %d %d %d\n' % _TH.hosts[host]) | |
1711 | + f.close() | |
1712 | + os.rename(tmp, filename) | |
1713 | + except IOError: pass | |
1714 | + _TH.dirty = False | |
1715 | + | |
1716 | + @staticmethod | |
1717 | + def update(url, dl_size, dl_time, ug_err, baseurl=None): | |
1718 | + # Use hostname from URL. If it's a file:// URL, use baseurl. | |
1719 | + # If no baseurl, do not update timedhosts. | |
1720 | + host = urlparse.urlsplit(url).netloc.split('@')[-1] or baseurl | |
1721 | + if not host: return | |
1722 | + | |
1723 | + _TH.load() | |
1724 | + speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0) | |
1725 | + now = time.time() | |
1726 | + | |
1727 | + if ug_err is None: | |
1728 | + # defer first update if the file was small. BZ 851178. | |
1729 | + if not ts and dl_size < 1e6: return | |
1730 | + | |
1731 | + # k1: the older, the less useful | |
1732 | + # k2: <500ms readings are less reliable | |
1733 | + # speeds vary, use 10:1 smoothing | |
1734 | + k1 = 2**((ts - now) / default_grabber.opts.half_life) | |
1735 | + k2 = min(dl_time / .500, 1.0) / 10 | |
1736 | + if k2 > 0: | |
1737 | + speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2) | |
1738 | + fail = 0 | |
1739 | + elif getattr(ug_err, 'code', None) == 404: | |
1740 | + fail = 0 # alive, at least | |
1741 | + else: | |
1742 | + fail += 1 # seems dead | |
1743 | + | |
1744 | + _TH.hosts[host] = speed, fail, now | |
1745 | + _TH.dirty = True | |
1746 | + | |
1747 | + @staticmethod | |
1748 | + def estimate(baseurl): | |
1749 | + _TH.load() | |
1750 | + | |
1751 | + # Use just the hostname, unless it's a file:// baseurl. | |
1752 | + host = urlparse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl | |
1753 | + | |
1754 | + default_speed = default_grabber.opts.default_speed | |
1755 | + try: speed, fail, ts = _TH.hosts[host] | |
1756 | + except KeyError: return default_speed, 0 | |
1757 | + | |
1758 | + speed *= 2**-fail | |
1759 | + k = 2**((ts - time.time()) / default_grabber.opts.half_life) | |
1760 | + speed = k * speed + (1 - k) * default_speed | |
1761 | + return speed, fail | |
1762 | + | |
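A worked instance of the decay arithmetic in update() and estimate(); the half_life and default_speed figures are assumed option defaults, not shown in this hunk:

    half_life, default_speed = 30*24*3600, 1e6  # assumed defaults
    speed, fail = 4e6, 1                        # stored estimate: 4 MB/s, one failure
    speed *= 2**-fail                           # halved per failure -> 2e6
    k = 2**(-half_life / float(half_life))      # reading is one half-life old -> 0.5
    speed = k*speed + (1 - k)*default_speed     # -> 1.5e6, blended toward the default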
1763 | +##################################################################### | |
1764 | # TESTING | |
1765 | def _main_test(): | |
1766 | try: url, filename = sys.argv[1:3] | |
f79f24fe | 1767 | diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py |
4fa42d3f | 1768 | index dad410b..7975f1b 100644 |
f79f24fe MT |
1769 | --- a/urlgrabber/mirror.py |
1770 | +++ b/urlgrabber/mirror.py | |
4fa42d3f MT |
1771 | @@ -76,6 +76,9 @@ CUSTOMIZATION |
1772 | 'grabber' is omitted, the default grabber will be used. If | |
1773 | kwargs are omitted, then (duh) they will not be used. | |
1774 | ||
1775 | + kwarg 'max_connections' limits the number of concurrent | |
1776 | + connections to this mirror. | |
1777 | + | |
1778 | 3) Pass keyword arguments when instantiating the mirror group. | |
1779 | See, for example, the failure_callback argument. | |
1780 | ||
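For illustration, a mirror list exercising the new kwarg; the 'mirror'/'kwargs' dict format is the one described above, grabber is an existing URLGrabber instance, and per the grabber code max_connections defaults to 2 when omitted:

    mirrors = [
        {'mirror': 'http://a.example.com/repo/', 'kwargs': {'max_connections': 4}},
        {'mirror': 'http://b.example.com/repo/', 'kwargs': {'private': True}},
        'http://c.example.com/repo/',  # bare string form, default limit
    ]
    mg = MirrorGroup(grabber, mirrors)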
1781 | @@ -87,10 +90,14 @@ CUSTOMIZATION | |
1782 | """ | |
1783 | ||
1784 | ||
1785 | +import sys | |
f79f24fe MT |
1786 | import random |
1787 | import thread # needed for locking to make this threadsafe | |
1788 | ||
1789 | -from grabber import URLGrabError, CallbackObject, DEBUG | |
1790 | +from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 | |
4fa42d3f MT |
1791 | +from grabber import _run_callback, _do_raise |
1792 | +from grabber import exception2msg | |
1793 | +from grabber import _TH | |
f79f24fe MT |
1794 | |
1795 | def _(st): | |
1796 | return st | |
4fa42d3f MT |
1797 | @@ -126,7 +133,9 @@ class MirrorGroup: |
1798 | files) | |
1799 | ||
1800 | * if the local list is ever exhausted, a URLGrabError will be | |
1801 | - raised (errno=256, no more mirrors) | |
1802 | + raised (errno=256, No more mirrors). The 'errors' attribute | |
1803 | + holds a list of (full_url, errmsg) tuples. This contains | |
1804 | + all URLs tried and the corresponding error messages. | |
1805 | ||
1806 | OPTIONS | |
1807 | ||
1808 | @@ -153,7 +162,8 @@ class MirrorGroup: | |
1809 | ||
1810 | The 'fail' option will cause immediate failure by re-raising | |
1811 | the exception and no further attempts to get the current | |
1812 | - download. | |
1813 | + download. As in the "No more mirrors" case, the 'errors' | |
1814 | + attribute is set in the exception object. | |
1815 | ||
1816 | This dict can be set at instantiation time, | |
1817 | mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) | |
1818 | @@ -184,6 +194,7 @@ class MirrorGroup: | |
1819 | ||
1820 | obj.exception = < exception that was raised > | |
1821 | obj.mirror = < the mirror that was tried > | |
1822 | + obj.tries = < the number of mirror tries so far > | |
1823 | obj.relative_url = < url relative to the mirror > | |
1824 | obj.url = < full url that failed > | |
1825 | # .url is just the combination of .mirror | |
1826 | @@ -251,6 +262,17 @@ class MirrorGroup: | |
1827 | self.default_action = None | |
1828 | self._process_kwargs(kwargs) | |
1829 | ||
1830 | + # use the same algorithm as parallel downloader to initially sort | |
1831 | + # the mirror list (sort by speed, but prefer live private mirrors) | |
1832 | + def estimate(m): | |
1833 | + speed, fail = _TH.estimate(m['mirror']) | |
1834 | + private = not fail and m.get('kwargs', {}).get('private', False) | |
1835 | + return private, speed | |
1836 | + | |
1837 | + # Update the initial order. Since sorting is stable, the relative | |
1838 | + # order of unknown (not yet used) hosts is retained. | |
1839 | + self.mirrors.sort(key=estimate, reverse=True) | |
1840 | + | |
1841 | # if these values are found in **kwargs passed to one of the urlXXX | |
1842 | # methods, they will be stripped before getting passed on to the | |
1843 | # grabber | |
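The key returns (private, speed) tuples, and tuples compare lexicographically with True > False, so a live private mirror outranks any faster public one:

    >>> sorted([(False, 9e6), (True, 1e6)], reverse=True)
    [(True, 1000000.0), (False, 9000000.0)]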
1844 | @@ -263,7 +285,8 @@ class MirrorGroup: | |
f79f24fe MT |
1845 | def _parse_mirrors(self, mirrors): |
1846 | parsed_mirrors = [] | |
1847 | for m in mirrors: | |
1848 | - if type(m) == type(''): m = {'mirror': m} | |
1849 | + if isinstance(m, basestring): | |
1850 | + m = {'mirror': _to_utf8(m)} | |
1851 | parsed_mirrors.append(m) | |
1852 | return parsed_mirrors | |
1853 | ||
4fa42d3f MT |
1854 | @@ -280,7 +303,9 @@ class MirrorGroup: |
1855 | # return a random mirror so that multiple mirrors get used | |
1856 | # even without failures. | |
1857 | if not gr.mirrors: | |
1858 | - raise URLGrabError(256, _('No more mirrors to try.')) | |
1859 | + e = URLGrabError(256, _('No more mirrors to try.')) | |
1860 | + e.errors = gr.errors | |
1861 | + raise e | |
1862 | return gr.mirrors[gr._next] | |
1863 | ||
1864 | def _failure(self, gr, cb_obj): | |
1865 | @@ -307,7 +332,9 @@ class MirrorGroup: | |
1866 | a.update(action) | |
1867 | action = a | |
1868 | self.increment_mirror(gr, action) | |
1869 | - if action and action.get('fail', 0): raise | |
1870 | + if action and action.get('fail', 0): | |
1871 | + sys.exc_info()[1].errors = gr.errors | |
1872 | + raise | |
1873 | ||
1874 | def increment_mirror(self, gr, action={}): | |
1875 | """Tell the mirror object increment the mirror index | |
1876 | @@ -377,35 +404,50 @@ class MirrorGroup: | |
1877 | gr.url = url | |
1878 | gr.kw = dict(kw) | |
1879 | self._load_gr(gr) | |
1880 | + gr.errors = [] | |
1881 | ||
1882 | for k in self.options: | |
1883 | try: del kw[k] | |
1884 | except KeyError: pass | |
1885 | ||
1886 | + tries = 0 | |
1887 | while 1: | |
1888 | + tries += 1 | |
1889 | mirrorchoice = self._get_mirror(gr) | |
1890 | fullurl = self._join_url(mirrorchoice['mirror'], gr.url) | |
1891 | - kwargs = dict(mirrorchoice.get('kwargs', {})) | |
1892 | - kwargs.update(kw) | |
1893 | grabber = mirrorchoice.get('grabber') or self.grabber | |
1894 | + # apply mirrorchoice kwargs on top of grabber.opts | |
1895 | + opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {})) | |
1896 | func_ref = getattr(grabber, func) | |
1897 | if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) | |
1898 | try: | |
1899 | - return func_ref( *(fullurl,), **kwargs ) | |
1900 | + return func_ref( *(fullurl,), opts=opts, **kw ) | |
1901 | except URLGrabError, e: | |
1902 | if DEBUG: DEBUG.info('MIRROR: failed') | |
1903 | + gr.errors.append((fullurl, exception2msg(e))) | |
1904 | obj = CallbackObject() | |
1905 | obj.exception = e | |
1906 | obj.mirror = mirrorchoice['mirror'] | |
1907 | obj.relative_url = gr.url | |
1908 | obj.url = fullurl | |
1909 | + obj.tries = tries | |
1910 | self._failure(gr, obj) | |
1911 | ||
1912 | def urlgrab(self, url, filename=None, **kwargs): | |
1913 | kw = dict(kwargs) | |
1914 | kw['filename'] = filename | |
1915 | + if kw.get('async'): | |
1916 | + # enable mirror failovers in async path | |
1917 | + kw['mirror_group'] = self, [], {}, set() | |
1918 | + kw['relative_url'] = url | |
1919 | + else: | |
1920 | + kw.pop('failfunc', None) | |
1921 | func = 'urlgrab' | |
1922 | - return self._mirror_try(func, url, kw) | |
1923 | + try: | |
1924 | + return self._mirror_try(func, url, kw) | |
1925 | + except URLGrabError, e: | |
1926 | + obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) | |
1927 | + return _run_callback(kwargs.get('failfunc', _do_raise), obj) | |
1928 | ||
1929 | def urlopen(self, url, **kwargs): | |
1930 | kw = dict(kwargs) | |
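In the async path urlgrab() only tags the request with the failover state tuple (self, errors, failed, removed) and defers all retries to parallel_wait(). A hedged usage sketch; the initial async key is a placeholder that parallel_wait() re-derives from the mirror it picks, and on_fail is a hypothetical callback:

    mg.urlgrab('repodata/primary.xml', 'primary.xml',
               async=('any-key', 2), failfunc=on_fail)  # queued for parallel_wait()
    parallel_wait()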
f79f24fe | 1931 | diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py |
4fa42d3f | 1932 | index dd07c6a..077fd99 100644 |
f79f24fe MT |
1933 | --- a/urlgrabber/progress.py |
1934 | +++ b/urlgrabber/progress.py | |
4fa42d3f MT |
1935 | @@ -133,8 +133,8 @@ class BaseMeter: |
1936 | # for a real gui, you probably want to override and put a call | |
1937 | # to your mainloop iteration function here | |
1938 | if now is None: now = time.time() | |
1939 | - if (now >= self.last_update_time + self.update_period) or \ | |
1940 | - not self.last_update_time: | |
1941 | + if (not self.last_update_time or | |
1942 | + (now >= self.last_update_time + self.update_period)): | |
1943 | self.re.update(amount_read, now) | |
1944 | self.last_amount_read = amount_read | |
1945 | self.last_update_time = now | |
1946 | @@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0): | |
1947 | # 4. + ( 5, total: 32) | |
1948 | # | |
1949 | ||
1950 | +def _term_add_bar(tl, bar_max_length, pc): | |
1951 | + blen = bar_max_length | |
1952 | + bar = '='*int(blen * pc) | |
1953 | + if (blen * pc) - int(blen * pc) >= 0.5: | |
1954 | + bar += '-' | |
1955 | + return tl.add(' [%-*.*s]' % (blen, blen, bar)) | |
1956 | + | |
1957 | +def _term_add_end(tl, osize, size): | |
1958 | + if osize is not None: | |
1959 | + if size > osize: # Is ??? better? Really need something to say < vs >. | |
1960 | + return tl.add(' !!! '), True | |
1961 | + elif size != osize: | |
1962 | + return tl.add(' ... '), True | |
1963 | + return tl.add(' ' * 5), False | |
1964 | + | |
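The trailing '-' marks a half-filled cell; a rounding sketch for _term_add_bar (TerminalLine is the helper class used throughout this file, and the brackets and padding come from its '%-*.*s' formatting):

    # blen=10, pc=0.55: int(10 * 0.55) = 5 full cells, remainder 0.5 >= 0.5,
    # so bar == '=====-' and the added field renders as ' [=====-    ]'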
1965 | class TextMeter(BaseMeter): | |
1966 | def __init__(self, fo=sys.stderr): | |
1967 | BaseMeter.__init__(self) | |
1968 | @@ -218,7 +233,6 @@ class TextMeter(BaseMeter): | |
1969 | ||
1970 | def _do_update(self, amount_read, now=None): | |
1971 | etime = self.re.elapsed_time() | |
1972 | - fetime = format_time(etime) | |
1973 | fread = format_number(amount_read) | |
1974 | #self.size = None | |
1975 | if self.text is not None: | |
1976 | @@ -234,16 +248,20 @@ class TextMeter(BaseMeter): | |
1977 | ||
1978 | # Include text + ui_rate in minimal | |
1979 | tl = TerminalLine(8, 8+1+8) | |
1980 | + if tl._llen > 80: | |
1981 | + use_hours = True # For big screens, make it more readable. | |
1982 | + else: | |
1983 | + use_hours = False | |
1984 | ui_size = tl.add(' | %5sB' % fread) | |
1985 | if self.size is None: | |
1986 | - ui_time = tl.add(' %9s' % fetime) | |
1987 | + ui_time = tl.add(' %9s' % format_time(etime, use_hours)) | |
1988 | ui_end = tl.add(' ' * 5) | |
1989 | ui_rate = tl.add(' %5sB/s' % ave_dl) | |
1990 | out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, | |
1991 | ui_rate, ui_size, ui_time, ui_end) | |
1992 | else: | |
1993 | rtime = self.re.remaining_time() | |
1994 | - frtime = format_time(rtime) | |
1995 | + frtime = format_time(rtime, use_hours) | |
1996 | frac = self.re.fraction_read() | |
1997 | ||
1998 | ui_time = tl.add(' %9s' % frtime) | |
1999 | @@ -259,13 +277,10 @@ class TextMeter(BaseMeter): | |
2000 | ui_rate = tl.add(' %5sB/s' % ave_dl) | |
2001 | # Make text grow a bit before we start growing the bar too | |
2002 | blen = 4 + tl.rest_split(8 + 8 + 4) | |
2003 | - bar = '='*int(blen * frac) | |
2004 | - if (blen * frac) - int(blen * frac) >= 0.5: | |
2005 | - bar += '-' | |
2006 | - ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar)) | |
2007 | - out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, | |
2008 | - ui_sofar_pc, ui_pc, ui_bar, | |
2009 | - ui_rate, ui_size, ui_time, ui_end) | |
2010 | + ui_bar = _term_add_bar(tl, blen, frac) | |
2011 | + out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, | |
2012 | + ui_sofar_pc, ui_pc, ui_bar, | |
2013 | + ui_rate, ui_size, ui_time, ui_end) | |
2014 | ||
2015 | self.fo.write(out) | |
2016 | self.fo.flush() | |
2017 | @@ -274,7 +289,6 @@ class TextMeter(BaseMeter): | |
2018 | global _text_meter_total_size | |
2019 | global _text_meter_sofar_size | |
2020 | ||
2021 | - total_time = format_time(self.re.elapsed_time()) | |
2022 | total_size = format_number(amount_read) | |
2023 | if self.text is not None: | |
2024 | text = self.text | |
2025 | @@ -282,14 +296,13 @@ class TextMeter(BaseMeter): | |
2026 | text = self.basename | |
2027 | ||
2028 | tl = TerminalLine(8) | |
2029 | - ui_size = tl.add(' | %5sB' % total_size) | |
2030 | - ui_time = tl.add(' %9s' % total_time) | |
2031 | - not_done = self.size is not None and amount_read != self.size | |
2032 | - if not_done: | |
2033 | - ui_end = tl.add(' ... ') | |
2034 | + if tl._llen > 80: | |
2035 | + use_hours = True # For big screens, make it more readable. | |
2036 | else: | |
2037 | - ui_end = tl.add(' ' * 5) | |
2038 | - | |
2039 | + use_hours = False | |
2040 | + ui_size = tl.add(' | %5sB' % total_size) | |
2041 | + ui_time = tl.add(' %9s' % format_time(self.re.elapsed_time(), use_hours)) | |
2042 | + ui_end, not_done = _term_add_end(tl, self.size, amount_read) | |
2043 | out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text, | |
2044 | ui_size, ui_time, ui_end) | |
2045 | self.fo.write(out) | |
2046 | @@ -331,12 +344,21 @@ class MultiFileHelper(BaseMeter): | |
2047 | def message(self, message): | |
2048 | self.master.message_meter(self, message) | |
2049 | ||
2050 | +class _FakeLock: | |
2051 | + def acquire(self): | |
2052 | + pass | |
2053 | + def release(self): | |
2054 | + pass | |
2055 | + | |
2056 | class MultiFileMeter: | |
2057 | helperclass = MultiFileHelper | |
2058 | - def __init__(self): | |
2059 | + def __init__(self, threaded=True): | |
2060 | self.meters = [] | |
2061 | self.in_progress_meters = [] | |
2062 | - self._lock = thread.allocate_lock() | |
2063 | + if threaded: | |
2064 | + self._lock = thread.allocate_lock() | |
2065 | + else: | |
2066 | + self._lock = _FakeLock() | |
2067 | self.update_period = 0.3 # seconds | |
2068 | ||
2069 | self.numfiles = None | |
2070 | @@ -369,6 +391,7 @@ class MultiFileMeter: | |
2071 | ||
2072 | def end(self, now=None): | |
2073 | if now is None: now = time.time() | |
2074 | + self.re.update(self._amount_read(), now) | |
2075 | self._do_end(now) | |
2076 | ||
2077 | def _do_end(self, now): | |
2078 | @@ -407,8 +430,8 @@ class MultiFileMeter: | |
2079 | def update_meter(self, meter, now): | |
2080 | if not meter in self.meters: | |
2081 | raise ValueError('attempt to use orphaned meter') | |
2082 | - if (now >= self.last_update_time + self.update_period) or \ | |
2083 | - not self.last_update_time: | |
2084 | + if (not self.last_update_time or | |
2085 | + (now >= self.last_update_time + self.update_period)): | |
2086 | self.re.update(self._amount_read(), now) | |
2087 | self.last_update_time = now | |
2088 | self._do_update_meter(meter, now) | |
2089 | @@ -466,34 +489,87 @@ class MultiFileMeter: | |
2090 | ||
2091 | ||
2092 | class TextMultiFileMeter(MultiFileMeter): | |
2093 | - def __init__(self, fo=sys.stderr): | |
2094 | + def __init__(self, fo=sys.stderr, threaded=True): | |
2095 | self.fo = fo | |
2096 | - MultiFileMeter.__init__(self) | |
2097 | + MultiFileMeter.__init__(self, threaded) | |
2098 | + self.index_time = self.index = 0 | |
2099 | ||
2100 | # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:## | |
2101 | +# New output, like TextMeter output... | |
2102 | +# update: No size (minimal: 17 chars) | |
2103 | +# ----------------------------------- | |
2104 | +# (<#file>/<#tot files>): <text> <rate> | <current size> <elapsed> | |
2105 | +# 8-48 1 8 3 6 1 7-9 5 | |
2106 | +# | |
2107 | +# update: Size, All files | |
2108 | +# ----------------------- | |
2109 | +# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA | |
2110 | +# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1 | |
2111 | +# end | |
2112 | +# --- | |
2113 | +# <text> | <file size> <file elapsed time> | |
2114 | +# 8-56 3 6 1 9 5 | |
2115 | def _do_update_meter(self, meter, now): | |
2116 | self._lock.acquire() | |
2117 | try: | |
2118 | - format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \ | |
2119 | - "time: %8.8s/%8.8s" | |
2120 | df = self.finished_files | |
2121 | tf = self.numfiles or 1 | |
2122 | - pf = 100 * float(df)/tf + 0.49 | |
2123 | + # Don't use "percent of files complete" ... | |
2124 | + # pf = 100 * float(df)/tf + 0.49 | |
2125 | dd = self.re.last_amount_read | |
2126 | - td = self.total_size | |
2127 | + td = self.re.total | |
2128 | pd = 100 * (self.re.fraction_read() or 0) + 0.49 | |
2129 | dt = self.re.elapsed_time() | |
2130 | rt = self.re.remaining_time() | |
2131 | - if rt is None: tt = None | |
2132 | - else: tt = dt + rt | |
2133 | ||
2134 | - fdd = format_number(dd) + 'B' | |
2135 | - ftd = format_number(td) + 'B' | |
2136 | - fdt = format_time(dt, 1) | |
2137 | - ftt = format_time(tt, 1) | |
2138 | - | |
2139 | - out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt)) | |
2140 | - self.fo.write('\r' + out) | |
2141 | + frac = self.re.fraction_read() or 0 | |
2142 | + pf = 100 * frac | |
2143 | + ave_dl = format_number(self.re.average_rate()) | |
2144 | + | |
2145 | + # cycle through active meters | |
2146 | + if now > self.index_time: | |
2147 | + self.index_time = now + 1.0 | |
2148 | + self.index += 1 | |
2149 | + if self.index >= len(self.meters): | |
2150 | + self.index = 0 | |
2151 | + meter = self.meters[self.index] | |
2152 | + text = meter.text or meter.basename | |
2153 | + if tf > 1: | |
2154 | + text = '(%u/%u): %s' % (df+1+self.index, tf, text) | |
2155 | + | |
2156 | + # Include text + ui_rate in minimal | |
2157 | + tl = TerminalLine(8, 8+1+8) | |
2158 | + if tl._llen > 80: | |
2159 | + use_hours = True # For big screens, make it more readable. | |
2160 | + time_len = 9 | |
2161 | + else: | |
2162 | + use_hours = False | |
2163 | + time_len = 7 | |
2164 | + | |
2165 | + ui_size = tl.add(' | %5sB' % format_number(dd)) | |
2166 | + | |
2167 | + if not self.re.total: | |
2168 | + ui_time = tl.add(' %*s' % (time_len, format_time(dt, use_hours))) | |
2169 | + ui_end = tl.add(' ' * 5) | |
2170 | + ui_rate = tl.add(' %5sB/s' % ave_dl) | |
2171 | + out = '\r%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, | |
2172 | + ui_rate, ui_size, ui_time, ui_end) | |
2173 | + else: | |
2174 | + ui_time = tl.add(' %*s' % (time_len, format_time(rt, use_hours))) | |
2175 | + ui_end = tl.add(' ETA ') | |
2176 | + | |
2177 | + ui_sofar_pc = tl.add(' %i%%' % pf, | |
2178 | + full_len=len(" (100%)")) | |
2179 | + ui_rate = tl.add(' %5sB/s' % ave_dl) | |
2180 | + | |
2181 | + # Make text grow a bit before we start growing the bar too | |
2182 | + blen = 4 + tl.rest_split(8 + 8 + 4) | |
2183 | + ui_bar = _term_add_bar(tl, blen, frac) | |
2184 | + out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, | |
2185 | + ui_sofar_pc, ui_bar, | |
2186 | + ui_rate, ui_size, ui_time, | |
2187 | + ui_end) | |
2188 | + self.fo.write(out) | |
2189 | self.fo.flush() | |
2190 | finally: | |
2191 | self._lock.release() | |
2192 | @@ -502,24 +578,40 @@ class TextMultiFileMeter(MultiFileMeter): | |
2193 | self._lock.acquire() | |
2194 | try: | |
2195 | format = "%-30.30s %6.6s %8.8s %9.9s" | |
2196 | - fn = meter.basename | |
2197 | + fn = meter.text or meter.basename | |
2198 | size = meter.last_amount_read | |
2199 | fsize = format_number(size) + 'B' | |
2200 | et = meter.re.elapsed_time() | |
2201 | - fet = format_time(et, 1) | |
2202 | - frate = format_number(size / et) + 'B/s' | |
2203 | - | |
2204 | - out = '%-79.79s' % (format % (fn, fsize, fet, frate)) | |
2205 | - self.fo.write('\r' + out + '\n') | |
2206 | + frate = format_number(et and size / et) + 'B/s' | |
2207 | + df = self.finished_files | |
2208 | + tf = self.numfiles or 1 | |
2209 | + | |
2210 | + total_size = format_number(size) | |
2211 | + text = meter.text or meter.basename | |
2212 | + if tf > 1: | |
2213 | + text = '(%u/%u): %s' % (df, tf, text) | |
2214 | + | |
2215 | + tl = TerminalLine(8) | |
2216 | + if tl._llen > 80: | |
2217 | + use_hours = True # For big screens, make it more readable. | |
2218 | + time_len = 9 | |
2219 | + else: | |
2220 | + use_hours = False | |
2221 | + time_len = 7 | |
2222 | + ui_size = tl.add(' | %5sB' % total_size) | |
2223 | + ui_time = tl.add(' %*s' % (time_len, format_time(et, use_hours))) | |
2224 | + ui_end, not_done = _term_add_end(tl, meter.size, size) | |
2225 | + out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text, | |
2226 | + ui_size, ui_time, ui_end) | |
2227 | + self.fo.write(out) | |
2228 | finally: | |
2229 | self._lock.release() | |
2230 | - self._do_update_meter(meter, now) | |
2231 | ||
2232 | def _do_failure_meter(self, meter, message, now): | |
2233 | self._lock.acquire() | |
2234 | try: | |
2235 | format = "%-30.30s %6.6s %s" | |
2236 | - fn = meter.basename | |
2237 | + fn = meter.text or meter.basename | |
2238 | if type(message) in (type(''), type(u'')): | |
2239 | message = message.splitlines() | |
2240 | if not message: message = [''] | |
2241 | @@ -536,15 +628,6 @@ class TextMultiFileMeter(MultiFileMeter): | |
2242 | pass | |
2243 | finally: | |
2244 | self._lock.release() | |
2245 | - | |
2246 | - def _do_end(self, now): | |
2247 | - self._do_update_meter(None, now) | |
2248 | - self._lock.acquire() | |
2249 | - try: | |
2250 | - self.fo.write('\n') | |
2251 | - self.fo.flush() | |
2252 | - finally: | |
2253 | - self._lock.release() | |
2254 | ||
2255 | ###################################################################### | |
2256 | # support classes and functions | |
2257 | @@ -658,6 +741,8 @@ def format_time(seconds, use_hours=0): | |
f79f24fe MT |
2258 | if seconds is None or seconds < 0: |
2259 | if use_hours: return '--:--:--' | |
2260 | else: return '--:--' | |
2261 | + elif seconds == float('inf'): | |
2262 | + return 'Infinite' | |
2263 | else: | |
2264 | seconds = int(seconds) | |
2265 | minutes = seconds / 60 | |
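Sample outputs, assuming the existing mm:ss and hh:mm:ss formatting branches below this hunk:

    # format_time(90)           -> '01:30'
    # format_time(3725, 1)      -> '01:02:05'
    # format_time(None)         -> '--:--'
    # format_time(float('inf')) -> 'Infinite'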
4fa42d3f MT |
2266 | @@ -722,9 +807,77 @@ def _tst(fn, cur, tot, beg, size, *args): |
2267 | time.sleep(delay) | |
2268 | tm.end(size) | |
2269 | ||
2270 | +def _mtst(datas, *args): | |
2271 | + print '-' * 79 | |
2272 | + tm = TextMultiFileMeter(threaded=False) | |
2273 | + | |
2274 | + dl_sizes = {} | |
2275 | + | |
2276 | + num = 0 | |
2277 | + total_size = 0 | |
2278 | + dl_total_size = 0 | |
2279 | + for data in datas: | |
2280 | + dl_size = None | |
2281 | + if len(data) == 2: | |
2282 | + fn, size = data | |
2283 | + dl_size = size | |
2284 | + if len(data) == 3: | |
2285 | + fn, size, dl_size = data | |
2286 | + nm = tm.newMeter() | |
2287 | + nm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size, | |
2288 | + text=fn) | |
2289 | + num += 1 | |
2290 | + assert dl_size is not None | |
2291 | + dl_total_size += dl_size | |
2292 | + dl_sizes[nm] = dl_size | |
2293 | + if size is None or total_size is None: | |
2294 | + total_size = None | |
2295 | + else: | |
2296 | + total_size += size | |
2297 | + tm.start(num, total_size) | |
2298 | + | |
2299 | + num = 0 | |
2300 | + off = 0 | |
2301 | + for (inc, delay) in args: | |
2302 | + off += 1 | |
2303 | + while num < ((dl_total_size * off) / len(args)): | |
2304 | + num += inc | |
2305 | + for nm in tm.meters[:]: | |
2306 | + if dl_sizes[nm] <= num: | |
2307 | + nm.end(dl_sizes[nm]) | |
2308 | + tm.removeMeter(nm) | |
2309 | + else: | |
2310 | + nm.update(num) | |
2311 | + time.sleep(delay) | |
2312 | + assert not tm.meters | |
2313 | + | |
2314 | if __name__ == "__main__": | |
2315 | # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 | |
2316 | # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 | |
2317 | + if len(sys.argv) >= 2 and sys.argv[1] == 'multi': | |
2318 | + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), | |
2319 | + ("s-1.0.1-1.fc8.i386.rpm", 5000), | |
2320 | + ("m-1.0.1-2.fc8.i386.rpm", 10000)), | |
2321 | + (100, 0.33), (500, 0.25), (1000, 0.1)) | |
2322 | + | |
2323 | + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), | |
2324 | + ("s-1.0.1-1.fc8.i386.rpm", 5000), | |
2325 | + ("m-1.0.1-2.fc8.i386.rpm", None, 10000)), | |
2326 | + (100, 0.33), (500, 0.25), (1000, 0.1)) | |
2327 | + | |
2328 | + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), | |
2329 | + ("s-1.0.1-1.fc8.i386.rpm", 2500000), | |
2330 | + ("m-1.0.1-2.fc8.i386.rpm", 10000)), | |
2331 | + (10, 0.2), (50, 0.1), (1000, 0.1)) | |
2332 | + | |
2333 | + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), | |
2334 | + ("s-1.0.1-1.fc8.i386.rpm", None, 2500000), | |
2335 | + ("m-1.0.1-2.fc8.i386.rpm", None, 10000)), | |
2336 | + (10, 0.2), (50, 0.1), (1000, 0.1)) | |
2337 | + # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25)) | |
2338 | + # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25)) | |
2339 | + sys.exit(0) | |
2340 | + | |
2341 | if len(sys.argv) >= 2 and sys.argv[1] == 'total': | |
2342 | text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 + | |
2343 | 1000000 + 10000 + 10000 + 10000 + 1000000) |