python-urlgrabber: Import latest fixes from upstream.
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1ffe416
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.py[co]
+MANIFEST
+dist
+build
+*.kdev*
+*.kateproject
+ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
          including quotes in the case of strings.
          e.g. --user_agent='"foobar/2.0"'
 
+  --output FILE
   -o FILE          write output to FILE, otherwise the basename of the
                    url will be used
   -O               print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
         return ug_options, ug_defaults
 
     def process_command_line(self):
-        short_options = 'vd:hoOpD'
+        short_options = 'vd:ho:OpD'
         long_options = ['profile', 'repeat=', 'verbose=',
-                        'debug=', 'help', 'progress']
+                        'debug=', 'help', 'progress', 'output=']
         ug_long = [ o + '=' for o in self.ug_options ]
-        optlist, args = getopt.getopt(sys.argv[1:], short_options,
-                                      long_options + ug_long)
+        try:
+            optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                          long_options + ug_long)
+        except getopt.GetoptError, e:
+            print >>sys.stderr, "Error:", e
+            self.help([], ret=1)
+
         self.verbose = 0
         self.debug = None
         self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
             if o == '--verbose': self.verbose = v
             if o == '-v': self.verbose += 1
             if o == '-o': self.outputfile = v
+            if o == '--output': self.outputfile = v
             if o == '-p' or o == '--progress': self.progress = 1
             if o == '-d' or o == '--debug': self.debug = v
             if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
             print "ERROR: cannot use -o when grabbing multiple files"
             sys.exit(1)
 
-    def help(self, args):
+    def help(self, args, ret=0):
         if not args:
             print MAINHELP
         else:
@@ -234,7 +241,7 @@ class client_options:
                 self.help_ug_option(a)
             else:
                 print 'ERROR: no help on command "%s"' % a
-        sys.exit(0)
+        sys.exit(ret)
 
     def help_doc(self):
         print __doc__
diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
new file mode 100755
index 0000000..3dafb12
--- /dev/null
+++ b/scripts/urlgrabber-ext-down
@@ -0,0 +1,75 @@
+#! /usr/bin/python
+#  A very simple external downloader
+#  Copyright 2011-2012 Zdenek Pavlas
+
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License as published by the Free Software Foundation; either
+#   version 2.1 of the License, or (at your option) any later version.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+import time, os, errno, sys
+from urlgrabber.grabber import \
+    _readlines, URLGrabberOptions, _loads, \
+    PyCurlFileObject, URLGrabError
+
+def write(fmt, *arg):
+    try: os.write(1, fmt % arg)
+    except OSError, e:
+        if e.args[0] != errno.EPIPE: raise
+        sys.exit(1)
+
+class ProxyProgress:
+    def start(self, *d1, **d2):
+        self.next_update = 0
+    def update(self, _amount_read):
+        t = time.time()
+        if t < self.next_update: return
+        self.next_update = t + 0.31
+        write('%d %d\n', self._id, _amount_read)
+
+def main():
+    import signal
+    signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
+    cnt = 0
+    while True:
+        lines = _readlines(0)
+        if not lines: break
+        for line in lines:
+            cnt += 1
+            opts = URLGrabberOptions()
+            opts._id = cnt
+            for k in line.split(' '):
+                k, v = k.split('=', 1)
+                setattr(opts, k, _loads(v))
+            if opts.progress_obj:
+                opts.progress_obj = ProxyProgress()
+                opts.progress_obj._id = cnt
+
+            dlsz = dltm = 0
+            try:
+                fo = PyCurlFileObject(opts.url, opts.filename, opts)
+                fo._do_grab()
+                fo.fo.close()
+                size = fo._amount_read
+                if fo._tm_last:
+                    dlsz = fo._tm_last[0] - fo._tm_first[0]
+                    dltm = fo._tm_last[1] - fo._tm_first[1]
+                ug_err = 'OK'
+            except URLGrabError, e:
+                size = 0
+                ug_err = '%d %s' % e.args
+            write('%d %d %d %.3f %s\n', opts._id, size, dlsz, dltm, ug_err)
+
+if __name__ == '__main__':
+    main()
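The helper reads one request per line on stdin (space-separated key=value pairs, values serialized with grabber._dumps()) and answers with lines of the form "id size dlsz dltm status". A minimal sketch of driving it by hand, assuming the script is installed as /usr/libexec/urlgrabber-ext-down and with made-up URL and paths:

    import subprocess
    from urlgrabber.grabber import _dumps

    p = subprocess.Popen('/usr/libexec/urlgrabber-ext-down',
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # one request line, serialized the same way _ExternalDownloader.start() does
    req = 'url=%s filename=%s' % (_dumps('http://example.com/f.txt'),
                                  _dumps('/tmp/f.txt'))
    p.stdin.write(req + '\n')
    print p.stdout.readline()   # e.g. "1 1234 1234 0.100 OK"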
diff --git a/setup.py b/setup.py
index d0b87b8..bfa4a18 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,10 @@ url = _urlgrabber.__url__
 packages = ['urlgrabber']
 package_dir = {'urlgrabber':'urlgrabber'}
 scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
-               ['README','LICENSE', 'TODO', 'ChangeLog'])]
+data_files = [
+    ('share/doc/' + name + '-' + version, ['README','LICENSE', 'TODO', 'ChangeLog']),
+    ('libexec', ['scripts/urlgrabber-ext-down']),
+]
 options = { 'clean' : { 'all' : 1 } }
 classifiers = [
         'Development Status :: 4 - Beta',
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
 from munittest import *
 
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 3e5f3b7..8eeaeda 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
 
     def http_error_416(self, req, fp, code, msg, hdrs):
         # HTTP's Range Not Satisfiable error
-        raise RangeError('Requested Range Not Satisfiable')
+        raise RangeError(9, 'Requested Range Not Satisfiable')
 
 class HTTPSRangeHandler(HTTPRangeHandler):
     """ Range Header support for HTTPS. """
@@ -208,7 +208,7 @@ class RangeableFileObject:
             bufsize = offset - pos
             buf = self.fo.read(bufsize)
             if len(buf) != bufsize:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             pos += bufsize
 
 class FileRangeHandler(urllib2.FileHandler):
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
         (fb,lb) = brange
         if lb == '': lb = size
         if fb < 0 or fb > size or lb > size:
-            raise RangeError('Requested Range Not Satisfiable')
+            raise RangeError(9, 'Requested Range Not Satisfiable')
         size = (lb - fb)
         fo = RangeableFileObject(fo, (fb,lb))
         headers = mimetools.Message(StringIO(
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
             (fb,lb) = range_tup
             if lb == '':
                 if retrlen is None or retrlen == 0:
-                    raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+                    raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
                 lb = retrlen
                 retrlen = lb - fb
                 if retrlen < 0:
                     # beginning of range is larger than file
-                    raise RangeError('Requested Range Not Satisfiable')
+                    raise RangeError(9, 'Requested Range Not Satisfiable')
             else:
                 retrlen = lb - fb
                 fp = RangeableFileObject(fp, (0,retrlen))
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
     # check if range is over the entire file
     if (fb,lb) == (0,''): return None
     # check that the range is valid
-    if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+    if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
     return (fb,lb)
 
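RangeError is an IOError subclass, so the two-argument form above fills in errno and strerror the way any IOError does; errno 9 matches the code URLGrabError documents for unsatisfiable ranges. A quick illustration:

    from urlgrabber.byterange import RangeError

    try:
        raise RangeError(9, 'Requested Range Not Satisfiable')
    except RangeError, e:
        print e.errno, e.strerror   # -> 9 Requested Range Not Satisfiable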
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..6ce9861 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
   progress_obj = None
 
     a class instance that supports the following methods:
-      po.start(filename, url, basename, length, text)
+      po.start(filename, url, basename, size, now, text)
       # length will be None if unknown
       po.update(read) # read == bytes read so far
       po.end()
 
+  multi_progress_obj = None
+
+    a class instance that supports the following methods:
+      mo.start(total_files, total_size)
+      mo.newMeter() => meter
+      mo.removeMeter(meter)
+      mo.end()
+
+    The 'meter' object is similar to progress_obj, but multiple
+    instances may be created and updated at the same time.
+
+    When downloading multiple files in parallel and multi_progress_obj
+    is None, progress_obj is used in compatibility mode: finished files
+    are shown but there's no in-progress display.
+
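A minimal multi-file meter satisfying the interface above might look like the following sketch; note that parallel_wait() also touches mo.re.total to correct totals, so real deployments use the meters shipped in urlgrabber.progress rather than a bare-bones class like this:

    class SimpleMeter:
        def start(self, text=None, **kwargs):
            self.text = text
        def update(self, amount_read):
            pass
        def end(self, amount_read):
            print 'finished %s (%s bytes)' % (self.text, amount_read)
        def failure(self, message):
            print 'failed %s: %s' % (self.text, message)

    class SimpleMultiMeter:
        def start(self, total_files, total_size):
            print 'fetching %d files, %d bytes' % (total_files, total_size)
        def newMeter(self):
            return SimpleMeter()
        def removeMeter(self, meter):
            pass
        def end(self):
            print 'all done'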
   text = None
 
     specifies alternative text to be passed to the progress meter
@@ -68,14 +83,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.
 
-  timeout = None
+  timeout = 300
 
-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option, see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
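In client code the option is simply another kwarg; a sketch with a made-up URL:

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(timeout=30)   # becomes CURLOPT_CONNECTTIMEOUT=30
    g.urlgrab('http://example.com/big.iso', '/tmp/big.iso')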
 
   bandwidth = 0
 
@@ -143,8 +158,12 @@ GENERAL ARGUMENTS (kwargs)
     note that proxy authentication information may be provided using
     normal URL constructs:
       proxies={ 'http' : 'http://user:host@foo:3128' }
-    Lastly, if proxies is None, the default environment settings will
-    be used.
+
+  libproxy = False
+
+    Use the libproxy module (if installed) to find proxies.
+    The libproxy code is only used if the proxies dictionary
+    does not provide any proxies.
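For example, an explicit dictionary always wins, '_none_' disables proxying for a scheme, and libproxy is consulted only when the dictionary yields nothing (a sketch; hostnames are made up):

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(proxies={'http': 'http://cache.example.com:3128',
                            'https': '_none_'},  # never proxy https
                   libproxy=True)                # fallback lookup only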
 
   prefix = None
 
@@ -198,6 +217,12 @@ GENERAL ARGUMENTS (kwargs)
     control, you should probably subclass URLParser and pass it in via
     the 'urlparser' option.
 
+  username = None
+    username to use for simple http auth - is automatically quoted for special characters
+
+  password = None
+    password to use for simple http auth - is automatically quoted for special characters
+
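Both are plain kwargs and end up in curl's USERPWD option; a sketch with made-up credentials:

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(username='alice', password='s3cret')
    g.urlgrab('https://example.com/private/file.txt', '/tmp/file.txt')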
   ssl_ca_cert = None
 
     this option can be used if M2Crypto is available and will be
@@ -211,43 +236,75 @@ GENERAL ARGUMENTS (kwargs)
     No-op when using the curl backend (default)
 
 
-  self.ssl_verify_peer = True
+  ssl_verify_peer = True
 
     Check the server's certificate to make sure it is valid with what our CA validates
 
-  self.ssl_verify_host = True
+  ssl_verify_host = True
 
     Check the server's hostname to make sure it matches the certificate DN
 
-  self.ssl_key = None
+  ssl_key = None
 
     Path to the key the client should use to connect/authenticate with
 
-  self.ssl_key_type = 'PEM'
+  ssl_key_type = 'PEM'
 
     PEM or DER - format of key
 
-  self.ssl_cert = None
+  ssl_cert = None
 
     Path to the ssl certificate the client should use to authenticate with
 
-  self.ssl_cert_type = 'PEM'
+  ssl_cert_type = 'PEM'
 
     PEM or DER - format of certificate
 
-  self.ssl_key_pass = None
+  ssl_key_pass = None
 
     password to access the ssl_key
 
-  self.size = None
+  size = None
 
     size (in bytes) or Maximum size of the thing being downloaded.
     This is mostly to keep us from exploding with an endless datastream
 
-  self.max_header_size = 2097152
+  max_header_size = 2097152
 
     Maximum size (in bytes) of the headers.
 
+  ip_resolve = 'whatever'
+
+    What type of name-to-IP resolving to use, default is to do both IPV4 and
+    IPV6.
+
+  async = (key, limit)
+
+    When this option is set, the urlgrab() is not processed immediately
+    but queued. parallel_wait() then processes grabs in parallel, limiting
+    the number of connections in each 'key' group to at most 'limit'.
+
+  max_connections
+
+    The global connection limit.
+
+  timedhosts
+
+    The filename of the host download statistics. If defined, urlgrabber
+    will update the stats at the end of every download. At the end of
+    parallel_wait(), the updated stats are saved. If synchronous grabs
+    are used, you should call th_save().
+
+  default_speed, half_life
+
+    These options only affect the async mirror selection code.
+    The default_speed option sets the speed estimate for mirrors
+    we have never downloaded from, and defaults to 1 MBps.
+
+    The speed estimate also drifts exponentially from the speed
+    actually measured to the default speed, with default
+    period of 30 days.
+
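Queued grabs only run once parallel_wait() is called; a sketch, with a made-up mirror hostname as the group key:

    from urlgrabber.grabber import URLGrabber, parallel_wait

    g = URLGrabber(retry=3)
    failures = []
    for i in range(4):
        g.urlgrab('http://mirror.example.com/part%d' % i, '/tmp/part%d' % i,
                  async=('mirror.example.com', 2),  # at most 2 connections for this key
                  failfunc=lambda obj: failures.append(obj.exception))
    parallel_wait()   # blocks until the queue drains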
 
 RETRY RELATED ARGUMENTS
 
@@ -328,6 +385,15 @@ RETRY RELATED ARGUMENTS
     but it cannot (without severe trickiness) prevent the exception
     from being raised.
 
+  failfunc = None
+
+    The callback that gets called when a urlgrab() request fails.
+    If defined, urlgrab() calls it instead of raising URLGrabError.
+    Callback syntax is identical to failure_callback.
+
+    Contrary to failure_callback, it's called only once. Its primary
+    purpose is to use urlgrab() without a try/except block.
+
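This makes error handling declarative instead of exceptional; a sketch with a made-up URL:

    from urlgrabber.grabber import URLGrabber

    failures = []
    def note_failure(obj):
        # urlgrab() sets obj.exception before calling us
        failures.append(obj.exception)

    g = URLGrabber()
    g.urlgrab('http://example.com/maybe-missing', failfunc=note_failure)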
   interrupt_callback = None
 
     This callback is called if KeyboardInterrupt is received at any
@@ -420,6 +486,7 @@ import time
 import string
 import urllib
 import urllib2
+from httplib import responses
 import mimetools
 import thread
 import types
@@ -428,9 +495,17 @@ import pycurl
 from ftplib import parse150
 from StringIO import StringIO
 from httplib import HTTPException
-import socket
+import socket, select, fcntl
 from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
 
+try:
+    import xattr
+    if not hasattr(xattr, 'set'):
+        xattr = None # This is a "newer" API.
+except ImportError:
+    xattr = None
+
+
 ########################################################################
 # MODULE INITIALIZATION
 ########################################################################
@@ -439,6 +514,12 @@ try:
 except:
     __version__ = '???'
 
+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
         else:  handler = logging.FileHandler(filename)
         handler.setFormatter(formatter)
         DBOBJ = logging.getLogger('urlgrabber')
+        DBOBJ.propagate = False
         DBOBJ.addHandler(handler)
         DBOBJ.setLevel(level)
     except (KeyError, ImportError, ValueError):
@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):
 
 def _log_package_state():
     if not DEBUG: return
-    DEBUG.info('urlgrabber version  = %s' % __version__)
-    DEBUG.info('trans function "_"  = %s' % _)
+    DEBUG.debug('urlgrabber version  = %s' % __version__)
+    DEBUG.debug('trans function "_"  = %s' % _)
 
 _init_default_logger()
 _log_package_state()
@@ -527,6 +609,29 @@ def _(st):
 # END MODULE INITIALIZATION
 ########################################################################
 
+########################################################################
+# UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+    '''convert 'unicode' to an encoded utf-8 byte string '''
+    # stolen from yum.i18n
+    if isinstance(obj, unicode):
+        obj = obj.encode('utf-8', errors)
+    return obj
+
+def exception2msg(e):
+    try:
+        return str(e)
+    except UnicodeEncodeError:
+        # always use byte strings
+        return unicode(e).encode('utf8')
+
+########################################################################
+# END UTILITY FUNCTIONS
+########################################################################
 
 
 class URLGrabError(IOError):
@@ -662,6 +767,7 @@ class URLParser:
         opts.quote = 0     --> do not quote it
         opts.quote = None  --> guess
         """
+        url = _to_utf8(url)
         quote = opts.quote
 
         if opts.prefix:
@@ -768,6 +874,41 @@ class URLGrabberOptions:
         else: # throttle is a float
             return self.bandwidth * self.throttle
 
+    def find_proxy(self, url, scheme):
+        """Find the proxy to use for this URL.
+        Use the proxies dictionary first, then libproxy.
+        """
+        self.proxy = None
+        if scheme not in ('ftp', 'http', 'https'):
+            return
+
+        if self.proxies:
+            proxy = self.proxies.get(scheme)
+            if proxy is None:
+                if scheme == 'http':
+                    proxy = self.proxies.get('https')
+                elif scheme == 'https':
+                    proxy = self.proxies.get('http')
+            if proxy == '_none_':
+                proxy = ''
+            self.proxy = proxy
+            return
+
+        if self.libproxy:
+            global _libproxy_cache
+            if _libproxy_cache is None:
+                try:
+                    import libproxy
+                    _libproxy_cache = libproxy.ProxyFactory()
+                except:
+                    _libproxy_cache = False
+            if _libproxy_cache:
+                for proxy in _libproxy_cache.getProxies(url):
+                    if proxy.startswith('http://'):
+                        if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+                        self.proxy = proxy
+                        break
+
     def derive(self, **kwargs):
         """Create a derived URLGrabberOptions instance.
         This method creates a new instance and overrides the
@@ -791,30 +932,37 @@ class URLGrabberOptions:
         provided here.
         """
         self.progress_obj = None
+        self.multi_progress_obj = None
         self.throttle = 1.0
         self.bandwidth = 0
         self.retry = None
         self.retrycodes = [-1,2,4,5,6,7]
         self.checkfunc = None
+        self.failfunc = _do_raise
         self.copy_local = 0
         self.close_connection = 0
         self.range = None
         self.user_agent = 'urlgrabber/%s' % __version__
+        self.ip_resolve = None
         self.keepalive = 1
         self.proxies = None
+        self.libproxy = False
+        self.proxy = None
         self.reget = None
         self.failure_callback = None
         self.interrupt_callback = None
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
+        self.username = None
+        self.password = None
         self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
         self.ssl_context = None # no-op in pycurl
         self.ssl_verify_peer = True # check peer's cert for authenticity
@@ -827,6 +975,12 @@ class URLGrabberOptions:
         self.size = None # if we know how big the thing we're getting is going
                          # to be. this is ultimately a MAXIMUM size for the file
         self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
+        self.async = None # blocking by default
+        self.mirror_group = None
+        self.max_connections = 5
+        self.timedhosts = None
+        self.half_life = 30*24*60*60 # 30 days
+        self.default_speed = 1e6 # 1 MBit
 
     def __repr__(self):
         return self.format()
@@ -846,7 +1000,18 @@ class URLGrabberOptions:
         s = s + indent + '}'
         return s
 
-class URLGrabber:
+def _do_raise(obj):
+    raise obj.exception
+
+def _run_callback(cb, obj):
+    if not cb:
+        return
+    if callable(cb):
+        return cb(obj)
+    cb, arg, karg = cb
+    return cb(obj, *arg, **karg)
+
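_run_callback() accepts either a bare callable or the (func, args, kwargs) tuple that the old _make_callback() parsed, so both forms keep working for checkfunc and friends. A sketch with a made-up URL:

    from urlgrabber.grabber import URLGrabber

    def checker(obj, *args, **kwargs):
        print 'verifying %s' % obj.url   # obj is a CallbackObject

    g = URLGrabber()
    g.urlgrab('http://example.com/f', checkfunc=checker)                 # bare callable
    g.urlgrab('http://example.com/f', checkfunc=(checker, ('x',), {}))   # tuple form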
+class URLGrabber(object):
     """Provides easy opening of URLs with a variety of options.
 
     All options are specified as kwargs. Options may be specified when
@@ -872,7 +1037,6 @@ class URLGrabber:
             # beware of infinite loops :)
             tries = tries + 1
             exception = None
-            retrycode = None
             callback  = None
             if DEBUG: DEBUG.info('attempt %i/%s: %s',
                                  tries, opts.retry, args[0])
@@ -883,54 +1047,62 @@ class URLGrabber:
             except URLGrabError, e:
                 exception = e
                 callback = opts.failure_callback
-                retrycode = e.errno
             except KeyboardInterrupt, e:
                 exception = e
                 callback = opts.interrupt_callback
+                if not callback:
+                    raise
 
             if DEBUG: DEBUG.info('exception: %s', exception)
             if callback:
                 if DEBUG: DEBUG.info('calling callback: %s', callback)
-                cb_func, cb_args, cb_kwargs = self._make_callback(callback)
                 obj = CallbackObject(exception=exception, url=args[0],
                                      tries=tries, retry=opts.retry)
-                cb_func(obj, *cb_args, **cb_kwargs)
+                _run_callback(callback, obj)
 
             if (opts.retry is None) or (tries == opts.retry):
                 if DEBUG: DEBUG.info('retries exceeded, re-raising')
                 raise
 
+            retrycode = getattr(exception, 'errno', None)
             if (retrycode is not None) and (retrycode not in opts.retrycodes):
                 if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
                                      retrycode, opts.retrycodes)
                 raise
 
-    def urlopen(self, url, **kwargs):
+    def urlopen(self, url, opts=None, **kwargs):
         """open the url and return a file object
         If a progress object or throttle value specified when this
         object was created, then a special file object will be
         returned that supports them. The file object can be treated
         like any other file object.
         """
-        opts = self.opts.derive(**kwargs)
+        url = _to_utf8(url)
+        opts = (opts or self.opts).derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts)
+        opts.find_proxy(url, parts[0])
         def retryfunc(opts, url):
             return PyCurlFileObject(url, filename=None, opts=opts)
         return self._retry(opts, retryfunc, url)
 
-    def urlgrab(self, url, filename=None, **kwargs):
+    def urlgrab(self, url, filename=None, opts=None, **kwargs):
         """grab the file at <url> and make a local copy at <filename>
         If filename is none, the basename of the url is used.
         urlgrab returns the filename of the local file, which may be
         different from the passed-in filename if copy_local == 0.
         """
-        opts = self.opts.derive(**kwargs)
+        url = _to_utf8(url)
+        opts = (opts or self.opts).derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts)
         (scheme, host, path, parm, query, frag) = parts
+        opts.find_proxy(url, scheme)
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
+            if not filename:
+                # This is better than nothing.
+                filename = 'index.html'
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a
             # copy currently
@@ -950,41 +1122,51 @@ class URLGrabber:
 
         elif not opts.range:
             if not opts.checkfunc is None:
-                cb_func, cb_args, cb_kwargs = \
-                         self._make_callback(opts.checkfunc)
-                obj = CallbackObject()
-                obj.filename = path
-                obj.url = url
-                apply(cb_func, (obj, )+cb_args, cb_kwargs)
+                obj = CallbackObject(filename=path, url=url)
+                _run_callback(opts.checkfunc, obj)
             return path
 
+        if opts.async:
+            opts.url = url
+            opts.filename = filename
+            opts.size = int(opts.size or 0)
+            _async_queue.append(opts)
+            return filename
+
         def retryfunc(opts, url, filename):
             fo = PyCurlFileObject(url, filename, opts)
             try:
                 fo._do_grab()
+                if fo._tm_last:
+                    dlsz = fo._tm_last[0] - fo._tm_first[0]
+                    dltm = fo._tm_last[1] - fo._tm_first[1]
+                    _TH.update(url, dlsz, dltm, None)
                 if not opts.checkfunc is None:
-                    cb_func, cb_args, cb_kwargs = \
-                             self._make_callback(opts.checkfunc)
-                    obj = CallbackObject()
-                    obj.filename = filename
-                    obj.url = url
-                    apply(cb_func, (obj, )+cb_args, cb_kwargs)
+                    obj = CallbackObject(filename=filename, url=url)
+                    _run_callback(opts.checkfunc, obj)
             finally:
                 fo.close()
             return filename
 
-        return self._retry(opts, retryfunc, url, filename)
+        try:
+            return self._retry(opts, retryfunc, url, filename)
+        except URLGrabError, e:
+            _TH.update(url, 0, 0, e)
+            opts.exception = e
+            return _run_callback(opts.failfunc, opts)
 
-    def urlread(self, url, limit=None, **kwargs):
+    def urlread(self, url, limit=None, opts=None, **kwargs):
         """read the url into a string, up to 'limit' bytes
         If the limit is exceeded, an exception will be thrown.  Note
         that urlread is NOT intended to be used as a way of saying
         "I want the first N bytes" but rather 'read the whole file
         into memory, but don't use too much'
         """
-        opts = self.opts.derive(**kwargs)
+        url = _to_utf8(url)
+        opts = (opts or self.opts).derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts)
+        opts.find_proxy(url, parts[0])
         if limit is not None:
             limit = limit + 1
 
@@ -1000,12 +1182,8 @@ class URLGrabber:
             else: s = fo.read(limit)
 
             if not opts.checkfunc is None:
-                cb_func, cb_args, cb_kwargs = \
-                         self._make_callback(opts.checkfunc)
-                obj = CallbackObject()
-                obj.data = s
-                obj.url = url
-                apply(cb_func, (obj, )+cb_args, cb_kwargs)
+                obj = CallbackObject(data=s, url=url)
+                _run_callback(opts.checkfunc, obj)
         finally:
             fo.close()
         return s
@@ -1020,6 +1198,7 @@ class URLGrabber:
         return s
 
     def _make_callback(self, callback_obj):
+        # not used, left for compatibility
         if callable(callback_obj):
             return callback_obj, (), {}
         else:
@@ -1030,7 +1209,7 @@ class URLGrabber:
 
 default_grabber = URLGrabber()
 
 
-class PyCurlFileObject():
+class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
         self._hdr_dump = ''
@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
+        self._tm_first = None
+        self._tm_last = None
         self._do_open()
 
-
+
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
         Any attribute not found in _this_ object will be searched for
@@ -1067,6 +1249,12 @@ class PyCurlFileObject():
 
     def _retrieve(self, buf):
         try:
+            tm = self._amount_read + len(buf), time.time()
+            if self._tm_first is None:
+                self._tm_first = tm
+            else:
+                self._tm_last = tm
+
             if not self._prog_running:
                 if self.opts.progress_obj:
                     size = self.size + self._reget_length
@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
                     self.opts.progress_obj.update(self._amount_read)
 
             self._amount_read += len(buf)
-            self.fo.write(buf)
+            try:
+                self.fo.write(buf)
+            except IOError, e:
+                self._cb_error = URLGrabError(16, exception2msg(e))
+                return -1
             return len(buf)
         except KeyboardInterrupt:
             return -1
 
     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump),
                                max_size=self.opts.max_header_size):
-            return -1
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
-
+
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.debug('header ended:')
+
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr = mimetools.Message(hdrfp)
         return self._parsed_hdr
 
@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
         if not opts:
             opts = self.opts
 
+        # keepalives
+        if not opts.keepalive:
+            self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
 
         # defaults we're always going to set
         self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
 
-        if DEBUG:
+        if DEBUG and DEBUG.level <= 10:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
         if opts.user_agent:
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+        if opts.ip_resolve:
+            # Default is: IPRESOLVE_WHATEVER
+            ipr = opts.ip_resolve.lower()
+            if ipr == 'whatever': # Do we need this?
+                self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER)
+            if ipr == 'ipv4':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+            if ipr == 'ipv6':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
 
         # maybe to be options later
         self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1370,11 @@ class PyCurlFileObject():
 
         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
 
         # ssl options
         if self.scheme == 'https':
@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
                 self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
                 self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
             self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
-            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+            if opts.ssl_verify_host: # 1 is meaningless to curl
+                self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
             if opts.ssl_key:
                 self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
             if opts.ssl_key_type:
                 self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
             if opts.ssl_cert:
                 self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+                # if we have a client side cert - turn off reuse b/c nss is odd
+                self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
             if opts.ssl_cert_type:
                 self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
             if opts.ssl_key_pass:
@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
         if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
             self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
 
-        # proxy settings
-        if opts.proxies:
-            for (scheme, proxy) in opts.proxies.items():
-                if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
-                    if scheme not in ('ftp'):
-                        continue
-                    else:
-                        if proxy == '_none_': proxy = ""
-                        self.curl_obj.setopt(pycurl.PROXY, proxy)
-                elif self.scheme in ('http', 'https'):
-                    if scheme not in ('http', 'https'):
-                        continue
-                    else:
-                        if proxy == '_none_': proxy = ""
-                        self.curl_obj.setopt(pycurl.PROXY, proxy)
-
-        # FIXME username/password/auth settings
+        # proxy
+        if opts.proxy is not None:
+            self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+            self.curl_obj.setopt(pycurl.PROXYAUTH,
+                # All but Kerberos.  BZ 769254
+                pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)
+
+        if opts.username and opts.password:
+            if self.scheme in ('http', 'https'):
+                self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+
+        if opts.username and opts.password:
+            # apparently when applying them as curlopts they do not require quoting of any kind
+            userpwd = '%s:%s' % (opts.username, opts.password)
+            self.curl_obj.setopt(pycurl.USERPWD, userpwd)
 
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+            self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
 
         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
1005 self.curl_obj.setopt(pycurl.URL, self.url)
4fa42d3f 1006@@ -1228,39 +1453,36 @@ class PyCurlFileObject():
f79f24fe
MT
1007
1008 code = self.http_code
1009 errcode = e.args[0]
1010+ errurl = urllib.unquote(self.url)
1011+
1012 if self._error[0]:
1013 errcode = self._error[0]
1014
4fa42d3f 1015- if errcode == 23 and code >= 200 and code < 299:
f79f24fe
MT
1016- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
1017- err.url = self.url
4fa42d3f
MT
1018-
1019+ if errcode == 23 and 200 <= code <= 299:
f79f24fe
MT
1020 # this is probably wrong but ultimately this is what happens
1021 # we have a legit http code and a pycurl 'writer failed' code
4fa42d3f
MT
1022 # which almost always means something aborted it from outside
1023 # since we cannot know what it is -I'm banking on it being
1024 # a ctrl-c. XXXX - if there's a way of going back two raises to
1025 # figure out what aborted the pycurl process FIXME
1026- raise KeyboardInterrupt
1027+ raise getattr(self, '_cb_error', KeyboardInterrupt)
f79f24fe
MT
1028
1029 elif errcode == 28:
1030- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
1031- err.url = self.url
1032+ err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
1033+ err.url = errurl
1034 raise err
1035 elif errcode == 35:
1036 msg = _("problem making ssl connection")
1037 err = URLGrabError(14, msg)
1038- err.url = self.url
1039+ err.url = errurl
1040 raise err
1041 elif errcode == 37:
1042- msg = _("Could not open/read %s") % (self.url)
1043+ msg = _("Could not open/read %s") % (errurl)
1044 err = URLGrabError(14, msg)
1045- err.url = self.url
1046+ err.url = errurl
1047 raise err
1048
1049 elif errcode == 42:
1050- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
1051- err.url = self.url
f79f24fe
MT
1052 # this is probably wrong but ultimately this is what happens
1053 # we have a legit http code and a pycurl 'writer failed' code
1054 # which almost always means something aborted it from outside
@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
         elif errcode == 58:
             msg = _("problem with the local client certificate")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
 
         elif errcode == 60:
-            msg = _("client cert cannot be verified or client cert incorrect")
+            msg = _("Peer cert cannot be verified or peer cert invalid")
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
 
         elif errcode == 63:
             if self._error[1]:
                 msg = self._error[1]
             else:
-                msg = _("Max download size exceeded on %s") % (self.url)
+                msg = _("Max download size exceeded on %s") % (errurl)
             err = URLGrabError(14, msg)
-            err.url = self.url
+            err.url = errurl
             raise err
 
-        elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-            msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+        elif str(e.args[1]) == '' and code and not 200 <= code <= 299:
+            if self.scheme in ['http', 'https']:
+                if self.http_code in responses:
+                    resp = responses[self.http_code]
+                    msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+                else:
+                    msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+            elif self.scheme in ['ftp']:
+                msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+            else:
+                msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
         else:
-            msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+            pyerr2str = { 5 : _("Couldn't resolve proxy"),
+                          6 : _("Couldn't resolve host"),
+                          7 : _("Couldn't connect"),
+                          8 : _("Bad reply to FTP server"),
+                          9 : _("Access denied"),
+                          11 : _("Bad reply to FTP pass"),
+                          13 : _("Bad reply to FTP pasv"),
+                          14 : _("Bad reply to FTP 227"),
+                          15 : _("Couldn't get FTP host"),
+                          17 : _("Couldn't set FTP type"),
+                          18 : _("Partial file"),
+                          19 : _("FTP RETR command failed"),
+                          22 : _("HTTP returned error"),
+                          23 : _("Write error"),
+                          25 : _("Upload failed"),
+                          26 : _("Read error"),
+                          27 : _("Out of Memory"),
+                          28 : _("Operation timed out"),
+                          30 : _("FTP PORT command failed"),
+                          31 : _("FTP REST command failed"),
+                          33 : _("Range failed"),
+                          34 : _("HTTP POST failed"),
+                          35 : _("SSL CONNECT failed"),
+                          36 : _("Couldn't resume download"),
+                          37 : _("Couldn't read file"),
+                          42 : _("Aborted by callback"),
+                          47 : _("Too many redirects"),
+                          51 : _("Peer certificate failed verification"),
+                          52 : _("Got nothing: SSL certificate expired?"),
+                          53 : _("SSL engine not found"),
+                          54 : _("SSL engine set failed"),
+                          55 : _("Network error send()"),
+                          56 : _("Network error recv()"),
+                          58 : _("Local certificate failed"),
+                          59 : _("SSL set cipher failed"),
+                          60 : _("Local CA certificate failed"),
+                          61 : _("HTTP bad transfer encoding"),
+                          63 : _("Maximum file size exceeded"),
+                          64 : _("FTP SSL failed"),
+                          67 : _("Authentication failure"),
+                          70 : _("Out of disk space on server"),
+                          73 : _("Remote file exists"),
+                          }
+            errstr = str(e.args[1])
+            if not errstr:
+                errstr = pyerr2str.get(errcode, '<Unknown>')
+            msg = 'curl#%s - "%s"' % (errcode, errstr)
             code = errcode
             err = URLGrabError(14, msg)
             err.code = code
             err.exception = e
             raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = urllib.unquote(self.url)
+                raise err
@@ -1333,7 +1616,11 @@ class PyCurlFileObject():
 
         if self.opts.range:
             rt = self.opts.range
-            if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+            if rt[0] is None:
+                rt = (0, rt[1])
+            rt = (rt[0] + reget_length, rt[1])
+
 
         if rt:
             header = range_tuple_to_header(rt)
@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
         #fh, self._temp_name = mkstemp()
         #self.fo = open(self._temp_name, 'wb')
 
-
-        self._do_perform()
-
-
-
+        try:
+            self._do_perform()
+        except URLGrabError, e:
+            self.fo.flush()
+            self.fo.close()
+            raise e
+
         if _was_filename:
             # close it up
             self.fo.flush()
             self.fo.close()
+
+            # Set the URL where we got it from:
+            if xattr is not None:
+                # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+                try:
+                    xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+                except:
+                    pass # URL too long. = IOError ... ignore everything.
+
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _(\
+                      'error setting timestamp on file %s from %s, OSError: %s')
+                              % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
             if self._prog_running:
                 downloaded += self._reget_length
                 self.opts.progress_obj.update(downloaded)
-        except KeyboardInterrupt:
+        except (KeyboardInterrupt, IOError):
             return -1
 
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
-            max_size = self.size
-            if self.opts.size: # if we set an opts size use that, no matter what
-                max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
+
+        if cur > int(float(max_size) * 1.10):
 
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
             return True
         return False
 
-    def _to_utf8(self, obj, errors='replace'):
-        '''convert 'unicode' to an encoded utf-8 byte string '''
-        # stolen from yum.i18n
-        if isinstance(obj, unicode):
-            obj = obj.encode('utf-8', errors)
-        return obj
-
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
 
-
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
+
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
 
+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+_libproxy_cache = None
+
 
 #####################################################################
 # DEPRECATED FUNCTIONS
@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
 
 
 #####################################################################
+#  Serializer + parser: A replacement of the rather bulky Json code.
+#
+#  - handles basic python literals, lists and tuples.
+#  - serialized strings never contain ' ' or '\n'
+#
+#####################################################################
+
+_quoter_map = {}
+for c in '%[(,)] \n':
+    _quoter_map[c] = '%%%02x' % ord(c)
+del c
+
+def _dumps(v):
+    if v is None: return 'None'
+    if v is True: return 'True'
+    if v is False: return 'False'
+    if type(v) in (int, long, float):
+        return str(v)
+    if type(v) == unicode:
+        v = v.encode('UTF8')
+    if type(v) == str:
+        def quoter(c): return _quoter_map.get(c, c)
+        return "'%s'" % ''.join(map(quoter, v))
+    if type(v) == tuple:
+        return "(%s)" % ','.join(map(_dumps, v))
+    if type(v) == list:
+        return "[%s]" % ','.join(map(_dumps, v))
+    raise TypeError, 'Can\'t serialize %s' % v
+
+def _loads(s):
+    def decode(v):
+        if v == 'None': return None
+        if v == 'True': return True
+        if v == 'False': return False
+        try: return int(v)
+        except ValueError: pass
+        try: return float(v)
+        except ValueError: pass
+        if len(v) >= 2 and v[0] == v[-1] == "'":
+            ret = []; i = 1
+            while True:
+                j = v.find('%', i)
+                ret.append(v[i:j]) # skips the final "'"
+                if j == -1: break
+                ret.append(chr(int(v[j + 1:j + 3], 16)))
+                i = j + 3
+            v = ''.join(ret)
+        return v
+    stk = None
+    l = []
+    i = j = 0
+    while True:
+        if j == len(s) or s[j] in ',)]':
+            if j > i:
+                l.append(decode(s[i:j]))
+            if j == len(s): break
+            if s[j] in ')]':
+                if s[j] == ')':
+                    l = tuple(l)
+                stk[0].append(l)
+                l, stk = stk
+            i = j = j + 1
+        elif s[j] in '[(':
+            stk = l, stk
+            l = []
+            i = j = j + 1
+        else:
+            j += 1 # safe because '[(,)]' are quoted
+    if stk: raise ValueError
+    if len(l) == 1: l = l[0]
+    return l
+
1365+#####################################################################
1366+# External downloader process
1367+#####################################################################
1368+
1369+def _readlines(fd):
1370+ buf = os.read(fd, 4096)
1371+ if not buf: return None
1372+ # whole lines only, no buffering
1373+ while buf[-1] != '\n':
1374+ buf += os.read(fd, 4096)
1375+ return buf[:-1].split('\n')
1376+
1377+import subprocess
1378+
1379+class _ExternalDownloader:
1380+ def __init__(self):
1381+ self.popen = subprocess.Popen(
1382+ '/usr/libexec/urlgrabber-ext-down',
1383+ stdin = subprocess.PIPE,
1384+ stdout = subprocess.PIPE,
1385+ )
1386+ self.stdin = self.popen.stdin.fileno()
1387+ self.stdout = self.popen.stdout.fileno()
1388+ self.running = {}
1389+ self.cnt = 0
1390+
1391+ # list of options we pass to downloader
1392+ _options = (
1393+ 'url', 'filename',
1394+ 'timeout', 'close_connection', 'keepalive',
1395+ 'throttle', 'bandwidth', 'range', 'reget',
1396+ 'user_agent', 'http_headers', 'ftp_headers',
1397+ 'proxy', 'prefix', 'username', 'password',
1398+ 'ssl_ca_cert',
1399+ 'ssl_cert', 'ssl_cert_type',
1400+ 'ssl_key', 'ssl_key_type',
1401+ 'ssl_key_pass',
1402+ 'ssl_verify_peer', 'ssl_verify_host',
1403+ 'size', 'max_header_size', 'ip_resolve',
1404+ )
1405+
1406+ def start(self, opts):
1407+ arg = []
1408+ for k in self._options:
1409+ v = getattr(opts, k)
1410+ if v is None: continue
1411+ arg.append('%s=%s' % (k, _dumps(v)))
1412+ if opts.progress_obj and opts.multi_progress_obj:
1413+ arg.append('progress_obj=True')
1414+ arg = ' '.join(arg)
1415+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
1416+
1417+ self.cnt += 1
1418+ self.running[self.cnt] = opts
1419+ os.write(self.stdin, arg +'\n')
1420+
1421+ def perform(self):
1422+ ret = []
1423+ lines = _readlines(self.stdout)
1424+ if not lines:
1425+ if DEBUG: DEBUG.info('downloader died')
1426+ raise KeyboardInterrupt
1427+ for line in lines:
1428+ # parse downloader output
1429+ line = line.split(' ', 5)
1430+ _id, size = map(int, line[:2])
1431+ if len(line) == 2:
1432+ self.running[_id]._progress.update(size)
1433+ continue
1434+ # job done
1435+ opts = self.running.pop(_id)
1436+ if line[4] == 'OK':
1437+ ug_err = None
1438+ if DEBUG: DEBUG.info('success')
1439+ else:
1440+ ug_err = URLGrabError(int(line[4]), line[5])
1441+ if DEBUG: DEBUG.info('failure: %s', ug_err)
1442+ _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async[0])
1443+ ret.append((opts, size, ug_err))
1444+ return ret
1445+
1446+ def abort(self):
1447+ self.popen.stdin.close()
1448+ self.popen.stdout.close()
1449+ self.popen.wait()
1450+
1451+class _ExternalDownloaderPool:
1452+ def __init__(self):
1453+ self.epoll = select.epoll()
1454+ self.running = {}
1455+ self.cache = {}
1456+
1457+ def start(self, opts):
1458+ host = urlparse.urlsplit(opts.url).netloc
1459+ dl = self.cache.pop(host, None)
1460+ if not dl:
1461+ dl = _ExternalDownloader()
1462+ fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD)
1463+ fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC)
1464+ self.epoll.register(dl.stdout, select.EPOLLIN)
1465+ self.running[dl.stdout] = dl
1466+ dl.start(opts)
1467+
1468+ def perform(self):
1469+ ret = []
1470+ for fd, event in self.epoll.poll():
1471+ if event & select.EPOLLHUP:
1472+ if DEBUG: DEBUG.info('downloader died')
1473+ raise KeyboardInterrupt
1474+ assert event & select.EPOLLIN
1475+ done = self.running[fd].perform()
1476+ if not done: continue
1477+ assert len(done) == 1
1478+ ret.extend(done)
1479+
1480+ # dl finished, move it to the cache
1481+ host = urlparse.urlsplit(done[0][0].url).netloc
1482+ if host in self.cache: self.cache[host].abort()
1483+ self.epoll.unregister(fd)
1484+ self.cache[host] = self.running.pop(fd)
1485+ return ret
1486+
1487+ def abort(self):
1488+ for dl in self.running.values():
1489+ self.epoll.unregister(dl.stdout)
1490+ dl.abort()
1491+ for dl in self.cache.values():
1492+ dl.abort()
1493+
1494+
1495+#####################################################################
1496+# High level async API
1497+#####################################################################
1498+
1499+_async_queue = []
1500+
1501+def parallel_wait(meter=None):
1502+ '''Process queued requests in parallel.
1503+ '''
1504+
1505+ # calculate total sizes
1506+ meters = {}
1507+ for opts in _async_queue:
1508+ if opts.progress_obj and opts.multi_progress_obj:
1509+ count, total = meters.get(opts.multi_progress_obj) or (0, 0)
1510+ meters[opts.multi_progress_obj] = count + 1, total + opts.size
1511+
1512+ # start multi-file meters
1513+ for meter in meters:
1514+ count, total = meters[meter]
1515+ meter.start(count, total)
1516+
1517+ dl = _ExternalDownloaderPool()
1518+ host_con = {} # current host connection counts
1519+
1520+ def start(opts, tries):
1521+ opts.tries = tries
1522+ try:
1523+ dl.start(opts)
1524+ except OSError, e:
1525+ # can't spawn downloader, give up immediately
1526+ opts.exception = URLGrabError(5, exception2msg(e))
1527+ _run_callback(opts.failfunc, opts)
1528+ return
1529+
1530+ key, limit = opts.async
1531+ host_con[key] = host_con.get(key, 0) + 1
1532+ if opts.progress_obj:
1533+ if opts.multi_progress_obj:
1534+ opts._progress = opts.multi_progress_obj.newMeter()
1535+ opts._progress.start(text=opts.text)
1536+ else:
1537+ opts._progress = time.time() # no updates
1538+
1539+ def perform():
1540+ for opts, size, ug_err in dl.perform():
1541+ key, limit = opts.async
1542+ host_con[key] -= 1
1543+
1544+ if ug_err is None:
1545+ if opts.checkfunc:
1546+ try: _run_callback(opts.checkfunc, opts)
1547+ except URLGrabError, ug_err: pass
1548+
1549+ if opts.progress_obj:
1550+ if opts.multi_progress_obj:
1551+ if ug_err:
1552+ opts._progress.failure(None)
1553+ else:
1554+ opts.multi_progress_obj.re.total += size - opts.size # correct totals
1555+ opts._progress.end(size)
1556+ opts.multi_progress_obj.removeMeter(opts._progress)
1557+ else:
1558+ opts.progress_obj.start(text=opts.text, now=opts._progress)
1559+ opts.progress_obj.update(size)
1560+ opts.progress_obj.end(size)
1561+ del opts._progress
1562+
1563+ if ug_err is None:
1564+ continue
1565+
1566+ retry = opts.retry or 0
1567+ if opts.failure_callback:
1568+ opts.exception = ug_err
1569+ try: _run_callback(opts.failure_callback, opts)
1570+ except URLGrabError, ug_err:
1571+ retry = 0 # no retries
1572+ if opts.tries < retry and ug_err.errno in opts.retrycodes:
1573+ start(opts, opts.tries + 1) # simple retry
1574+ continue
1575+
1576+ if opts.mirror_group:
1577+ mg, errors, failed, removed = opts.mirror_group
1578+ errors.append((opts.url, exception2msg(ug_err)))
1579+ failed[key] = failed.get(key, 0) + 1
1580+ opts.mirror = key
1581+ opts.exception = ug_err
1582+ action = mg.default_action or {}
1583+ if mg.failure_callback:
1584+ opts.tries = len(errors)
1585+ action = dict(action) # update only the copy
1586+ action.update(_run_callback(mg.failure_callback, opts))
1587+ if not action.get('fail', 0):
1588+ # mask this mirror and retry
1589+ if action.get('remove', 1):
1590+ removed.add(key)
1591+ _async_queue.append(opts)
1592+ continue
1593+ # fail=1 from callback
1594+ ug_err.errors = errors
1595+
1596+ # urlgrab failed
1597+ opts.exception = ug_err
1598+ _run_callback(opts.failfunc, opts)
1599+
1600+ try:
1601+ idx = 0
1602+ while True:
1603+ if idx >= len(_async_queue):
1604+ # the queue is empty
1605+ if not dl.running: break
1606+ # pending dl may extend it
1607+ perform()
1608+ continue
1609+
1610+ # handle next request
1611+ opts = _async_queue[idx]
1612+ idx += 1
1613+
1614+ # check global limit
1615+ while len(dl.running) >= default_grabber.opts.max_connections:
1616+ perform()
1617+ if DEBUG:
1618+ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
1619+
1620+ if opts.mirror_group:
1621+ mg, errors, failed, removed = opts.mirror_group
1622+
1623+ # find the best mirror
1624+ best = None
1625+ best_speed = None
1626+ for mirror in mg.mirrors:
1627+ key = mirror['mirror']
1628+ if key in removed: continue
1629+
1630+ # estimate mirror speed
1631+ speed, fail = _TH.estimate(key)
1632+ speed /= 1 + host_con.get(key, 0)
1633+
1634+ # order by: least failures, private flag, best speed
1635+ # ignore 'private' flag if there were failures
1636+ private = not fail and mirror.get('kwargs', {}).get('private', False)
1637+ speed = -failed.get(key, 0), private, speed
1638+ if best is None or speed > best_speed:
1639+ best = mirror
1640+ best_speed = speed
1641+
1642+ if best is None:
1643+ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
1644+ opts.exception.errors = errors
1645+ _run_callback(opts.failfunc, opts)
1646+ continue
1647+
1648+ # update the grabber object, apply mirror kwargs
1649+ grabber = best.get('grabber') or mg.grabber
1650+ opts.delegate = grabber.opts.derive(**best.get('kwargs', {}))
1651+
1652+ # update the current mirror and limit
1653+ key = best['mirror']
1654+ limit = best.get('kwargs', {}).get('max_connections', 2)
1655+ opts.async = key, limit
1656+
1657+ # update URL and proxy
1658+ url = mg._join_url(key, opts.relative_url)
1659+ url, parts = opts.urlparser.parse(url, opts)
1660+ opts.find_proxy(url, parts[0])
1661+ opts.url = url
1662+
1663+ # check host limit, then start
1664+ key, limit = opts.async
1665+ while host_con.get(key, 0) >= limit:
1666+ perform()
1667+ if DEBUG:
1668+ DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit)
1669+
1670+ start(opts, 1)
1671+ except IOError, e:
1672+ if e.errno != 4: raise
1673+ raise KeyboardInterrupt
1674+
1675+ finally:
1676+ dl.abort()
1677+ for meter in meters:
1678+ meter.end()
1679+ del _async_queue[:]
1680+ _TH.save()
1681+
1682+
1683+#####################################################################
1684+# Host bandwidth estimation
1685+#####################################################################
1686+
1687+class _TH:
1688+ hosts = {}
1689+ dirty = None
1690+
1691+ @staticmethod
1692+ def load():
1693+ filename = default_grabber.opts.timedhosts
1694+ if filename and _TH.dirty is None:
1695+ try:
1696+ for line in open(filename):
1697+ host, speed, fail, ts = line.rsplit(' ', 3)
1698+ _TH.hosts[host] = int(speed), int(fail), int(ts)
1699+ except IOError: pass
1700+ _TH.dirty = False
1701+
1702+ @staticmethod
1703+ def save():
1704+ filename = default_grabber.opts.timedhosts
1705+ if filename and _TH.dirty is True:
1706+ tmp = '%s.%d' % (filename, os.getpid())
1707+ try:
1708+ f = open(tmp, 'w')
1709+ for host in _TH.hosts:
1710+ f.write(host + ' %d %d %d\n' % _TH.hosts[host])
1711+ f.close()
1712+ os.rename(tmp, filename)
1713+ except IOError: pass
1714+ _TH.dirty = False
1715+
1716+ @staticmethod
1717+ def update(url, dl_size, dl_time, ug_err, baseurl=None):
1718+ # Use hostname from URL. If it's a file:// URL, use baseurl.
1719+ # If no baseurl, do not update timedhosts.
1720+ host = urlparse.urlsplit(url).netloc.split('@')[-1] or baseurl
1721+ if not host: return
1722+
1723+ _TH.load()
1724+ speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0)
1725+ now = time.time()
1726+
1727+ if ug_err is None:
1728+ # defer first update if the file was small. BZ 851178.
1729+ if not ts and dl_size < 1e6: return
1730+
1731+ # k1: the older, the less useful
1732+ # k2: <500ms readings are less reliable
1733+ # speeds vary, use 10:1 smoothing
1734+ k1 = 2**((ts - now) / default_grabber.opts.half_life)
1735+ k2 = min(dl_time / .500, 1.0) / 10
1736+ if k2 > 0:
1737+ speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
1738+ fail = 0
1739+ elif getattr(ug_err, 'code', None) == 404:
1740+ fail = 0 # alive, at least
1741+ else:
1742+ fail += 1 # seems dead
1743+
1744+ _TH.hosts[host] = speed, fail, now
1745+ _TH.dirty = True
1746+
1747+ @staticmethod
1748+ def estimate(baseurl):
1749+ _TH.load()
1750+
1751+ # Use just the hostname, unless it's a file:// baseurl.
1752+ host = urlparse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl
1753+
1754+ default_speed = default_grabber.opts.default_speed
1755+ try: speed, fail, ts = _TH.hosts[host]
1756+ except KeyError: return default_speed, 0
1757+
1758+ speed *= 2**-fail
1759+ k = 2**((ts - time.time()) / default_grabber.opts.half_life)
1760+ speed = k * speed + (1 - k) * default_speed
1761+ return speed, fail
1762+
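The estimator keeps one "host speed fail timestamp" line per host in the
timedhosts file and decays measurements with age: each half_life halves the
weight of the stored speed, each recorded failure halves the speed again, and
the result is blended back toward default_speed. A worked example of the same
arithmetic as _TH.estimate(), with assumed option values (1 MB/s default
speed, 30-day half-life):

    def blended_estimate(speed, fail, ts, now,
                         half_life=30*24*3600, default_speed=1e6):
        # mirrors _TH.estimate(): penalize failures, decay toward default
        speed *= 2 ** -fail
        k = 2 ** ((ts - now) / float(half_life))
        return k * speed + (1 - k) * default_speed

    # a host measured at 800 kB/s exactly one half-life ago: k = 0.5,
    # so the estimate is 0.5*800e3 + 0.5*1e6 = 900 kB/s
    print blended_estimate(800e3, 0, 0, 30*24*3600)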
1763+#####################################################################
1764 # TESTING
1765 def _main_test():
1766 try: url, filename = sys.argv[1:3]
1767diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
1768index dad410b..7975f1b 100644
1769--- a/urlgrabber/mirror.py
1770+++ b/urlgrabber/mirror.py
1771@@ -76,6 +76,9 @@ CUSTOMIZATION
1772 'grabber' is omitted, the default grabber will be used. If
1773 kwargs are omitted, then (duh) they will not be used.
1774
1775+ kwarg 'max_connections' limits the number of concurrent
1776+ connections to this mirror.
1777+
1778 3) Pass keyword arguments when instantiating the mirror group.
1779 See, for example, the failure_callback argument.
1780
1781@@ -87,10 +90,14 @@ CUSTOMIZATION
1782 """
1783
1784
1785+import sys
1786 import random
1787 import thread # needed for locking to make this threadsafe
1788
1789-from grabber import URLGrabError, CallbackObject, DEBUG
1790+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
1791+from grabber import _run_callback, _do_raise
1792+from grabber import exception2msg
1793+from grabber import _TH
1794
1795 def _(st):
1796 return st
1797@@ -126,7 +133,9 @@ class MirrorGroup:
1798 files)
1799
1800 * if the local list is ever exhausted, a URLGrabError will be
1801- raised (errno=256, no more mirrors)
1802+ raised (errno=256, No more mirrors). The 'errors' attribute
1803+ holds a list of (full_url, errmsg) tuples. This contains
1804+ all URLs tried and the corresponding error messages.
1805
1806 OPTIONS
1807
1808@@ -153,7 +162,8 @@ class MirrorGroup:
1809
1810 The 'fail' option will cause immediate failure by re-raising
1811 the exception and no further attempts to get the current
1812- download.
1813+ download. As in the "No more mirrors" case, the 'errors'
1814+ attribute is set in the exception object.
1815
1816 This dict can be set at instantiation time,
1817 mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
1818@@ -184,6 +194,7 @@ class MirrorGroup:
1819
1820 obj.exception = < exception that was raised >
1821 obj.mirror = < the mirror that was tried >
1822+ obj.tries = < the number of mirror tries so far >
1823 obj.relative_url = < url relative to the mirror >
1824 obj.url = < full url that failed >
1825 # .url is just the combination of .mirror
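With obj.tries now available, a callback can distinguish the first failure
from repeated ones. A sketch of a failure_callback using these attributes,
given an existing grabber and mirror list (the two-try policy shown is
illustrative, not part of the patch):

    def cb(obj):
        print 'mirror %s failed (try %d): %s' % (
            obj.mirror, obj.tries, obj.exception)
        if obj.tries >= 2:
            return {'fail': 1}    # re-raise; .errors is attached below
        return {'remove': 0}      # keep this mirror in the list, retry

    mg = MirrorGroup(grabber, mirrors, failure_callback=cb)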
1826@@ -251,6 +262,17 @@ class MirrorGroup:
1827 self.default_action = None
1828 self._process_kwargs(kwargs)
1829
1830+ # use the same algorithm as parallel downloader to initially sort
1831+ # the mirror list (sort by speed, but prefer live private mirrors)
1832+ def estimate(m):
1833+ speed, fail = _TH.estimate(m['mirror'])
1834+ private = not fail and m.get('kwargs', {}).get('private', False)
1835+ return private, speed
1836+
1837+ # update the initial order. since sorting is stable, the relative
1838+ # order of unknown (not used yet) hosts is retained.
1839+ self.mirrors.sort(key=estimate, reverse=True)
1840+
1841 # if these values are found in **kwargs passed to one of the urlXXX
1842 # methods, they will be stripped before getting passed on to the
1843 # grabber
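Because Python's sort is stable, hosts with no recorded history all carry the
same default_speed estimate and keep their configured order, while measured
hosts move ahead or behind on the (private, speed) key. For example, with
assumed estimates and illustrative host names:

    mirrors = [{'mirror': 'http://a.example.com/'},   # no history yet
               {'mirror': 'http://b.example.com/'},   # measured, slow
               {'mirror': 'http://c.example.com/',    # live and private
                'kwargs': {'private': True}}]
    # after the sort above: c first (private wins), then a (default
    # estimate) ahead of the slow, measured b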
1844@@ -263,7 +285,8 @@ class MirrorGroup:
1845 def _parse_mirrors(self, mirrors):
1846 parsed_mirrors = []
1847 for m in mirrors:
1848- if type(m) == type(''): m = {'mirror': m}
1849+ if isinstance(m, basestring):
1850+ m = {'mirror': _to_utf8(m)}
1851 parsed_mirrors.append(m)
1852 return parsed_mirrors
1853
1854@@ -280,7 +303,9 @@ class MirrorGroup:
1855 # return a random mirror so that multiple mirrors get used
1856 # even without failures.
1857 if not gr.mirrors:
1858- raise URLGrabError(256, _('No more mirrors to try.'))
1859+ e = URLGrabError(256, _('No more mirrors to try.'))
1860+ e.errors = gr.errors
1861+ raise e
1862 return gr.mirrors[gr._next]
1863
1864 def _failure(self, gr, cb_obj):
1865@@ -307,7 +332,9 @@ class MirrorGroup:
1866 a.update(action)
1867 action = a
1868 self.increment_mirror(gr, action)
1869- if action and action.get('fail', 0): raise
1870+ if action and action.get('fail', 0):
1871+ sys.exc_info()[1].errors = gr.errors
1872+ raise
1873
1874 def increment_mirror(self, gr, action={}):
1875 """Tell the mirror object increment the mirror index
1876@@ -377,35 +404,50 @@ class MirrorGroup:
1877 gr.url = url
1878 gr.kw = dict(kw)
1879 self._load_gr(gr)
1880+ gr.errors = []
1881
1882 for k in self.options:
1883 try: del kw[k]
1884 except KeyError: pass
1885
1886+ tries = 0
1887 while 1:
1888+ tries += 1
1889 mirrorchoice = self._get_mirror(gr)
1890 fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
1891- kwargs = dict(mirrorchoice.get('kwargs', {}))
1892- kwargs.update(kw)
1893 grabber = mirrorchoice.get('grabber') or self.grabber
1894+ # apply mirrorchoice kwargs on top of grabber.opts
1895+ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
1896 func_ref = getattr(grabber, func)
1897 if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
1898 try:
1899- return func_ref( *(fullurl,), **kwargs )
1900+ return func_ref( *(fullurl,), opts=opts, **kw )
1901 except URLGrabError, e:
1902 if DEBUG: DEBUG.info('MIRROR: failed')
1903+ gr.errors.append((fullurl, exception2msg(e)))
1904 obj = CallbackObject()
1905 obj.exception = e
1906 obj.mirror = mirrorchoice['mirror']
1907 obj.relative_url = gr.url
1908 obj.url = fullurl
1909+ obj.tries = tries
1910 self._failure(gr, obj)
1911
1912 def urlgrab(self, url, filename=None, **kwargs):
1913 kw = dict(kwargs)
1914 kw['filename'] = filename
1915+ if kw.get('async'):
1916+ # enable mirror failovers in async path
1917+ kw['mirror_group'] = self, [], {}, set()
1918+ kw['relative_url'] = url
1919+ else:
1920+ kw.pop('failfunc', None)
1921 func = 'urlgrab'
1922- return self._mirror_try(func, url, kw)
1923+ try:
1924+ return self._mirror_try(func, url, kw)
1925+ except URLGrabError, e:
1926+ obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
1927+ return _run_callback(kwargs.get('failfunc', _do_raise), obj)
1928
1929 def urlopen(self, url, **kwargs):
1930 kw = dict(kwargs)
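In the async path the whole failover state rides along in the mirror_group
tuple and is consumed by parallel_wait(), while the synchronous path keeps
failfunc out of the grabber kwargs and runs it here instead of raising. A
sketch of the synchronous form (host names and paths illustrative):

    from urlgrabber.grabber import default_grabber

    def failed(obj):
        # obj.exception.errors lists every (full_url, errmsg) tried
        print 'gave up on %s: %s' % (obj.url, obj.exception.errors)

    mg = MirrorGroup(default_grabber,
                     ['http://m1.example.com/', 'http://m2.example.com/'])
    mg.urlgrab('path/file.rpm', 'file.rpm', failfunc=failed)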
1931diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
1932index dd07c6a..077fd99 100644
1933--- a/urlgrabber/progress.py
1934+++ b/urlgrabber/progress.py
1935@@ -133,8 +133,8 @@ class BaseMeter:
1936 # for a real gui, you probably want to override and put a call
1937 # to your mainloop iteration function here
1938 if now is None: now = time.time()
1939- if (now >= self.last_update_time + self.update_period) or \
1940- not self.last_update_time:
1941+ if (not self.last_update_time or
1942+ (now >= self.last_update_time + self.update_period)):
1943 self.re.update(amount_read, now)
1944 self.last_amount_read = amount_read
1945 self.last_update_time = now
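The rewritten condition keeps the same throttle but also fires on the very
first update, when last_update_time is still unset. Callers can therefore
call update() as often as they like; repaints happen at most once per
update_period. A small sketch (URL and sizes illustrative):

    m = TextMeter()
    m.start(filename='f', url='http://h.example.com/f',
            basename='f', size=1000)
    for i in range(101):
        m.update(i * 10)   # most of these return without redrawing
    m.end(1000)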
1946@@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0):
1947 # 4. + ( 5, total: 32)
1948 #
1949
1950+def _term_add_bar(tl, bar_max_length, pc):
1951+ blen = bar_max_length
1952+ bar = '='*int(blen * pc)
1953+ if (blen * pc) - int(blen * pc) >= 0.5:
1954+ bar += '-'
1955+ return tl.add(' [%-*.*s]' % (blen, blen, bar))
1956+
1957+def _term_add_end(tl, osize, size):
1958+ if osize is not None:
1959+ if size > osize: # Is ??? better? Really need something to say < vs >.
1960+ return tl.add(' !!! '), True
1961+ elif size != osize:
1962+ return tl.add(' ... '), True
1963+ return tl.add(' ' * 5), False
1964+
1965 class TextMeter(BaseMeter):
1966 def __init__(self, fo=sys.stderr):
1967 BaseMeter.__init__(self)
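The bar helper rounds a half-filled cell to '-' rather than dropping it. A
worked example of the rounding, with an assumed width of 10 cells at 55%:

    blen, pc = 10, 0.55
    bar = '=' * int(blen * pc)                 # 5 full cells -> '====='
    if (blen * pc) - int(blen * pc) >= 0.5:    # remainder 0.5 rounds up
        bar += '-'
    print ' [%-*.*s]' % (blen, blen, bar)      # ' [=====-    ]'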
1968@@ -218,7 +233,6 @@ class TextMeter(BaseMeter):
1969
1970 def _do_update(self, amount_read, now=None):
1971 etime = self.re.elapsed_time()
1972- fetime = format_time(etime)
1973 fread = format_number(amount_read)
1974 #self.size = None
1975 if self.text is not None:
1976@@ -234,16 +248,20 @@ class TextMeter(BaseMeter):
1977
1978 # Include text + ui_rate in minimal
1979 tl = TerminalLine(8, 8+1+8)
1980+ if tl._llen > 80:
1981+ use_hours = True # For big screens, make it more readable.
1982+ else:
1983+ use_hours = False
1984 ui_size = tl.add(' | %5sB' % fread)
1985 if self.size is None:
1986- ui_time = tl.add(' %9s' % fetime)
1987+ ui_time = tl.add(' %9s' % format_time(etime, use_hours))
1988 ui_end = tl.add(' ' * 5)
1989 ui_rate = tl.add(' %5sB/s' % ave_dl)
1990 out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
1991 ui_rate, ui_size, ui_time, ui_end)
1992 else:
1993 rtime = self.re.remaining_time()
1994- frtime = format_time(rtime)
1995+ frtime = format_time(rtime, use_hours)
1996 frac = self.re.fraction_read()
1997
1998 ui_time = tl.add(' %9s' % frtime)
1999@@ -259,13 +277,10 @@ class TextMeter(BaseMeter):
2000 ui_rate = tl.add(' %5sB/s' % ave_dl)
2001 # Make text grow a bit before we start growing the bar too
2002 blen = 4 + tl.rest_split(8 + 8 + 4)
2003- bar = '='*int(blen * frac)
2004- if (blen * frac) - int(blen * frac) >= 0.5:
2005- bar += '-'
2006- ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
2007- out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
2008- ui_sofar_pc, ui_pc, ui_bar,
2009- ui_rate, ui_size, ui_time, ui_end)
2010+ ui_bar = _term_add_bar(tl, blen, frac)
2011+ out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
2012+ ui_sofar_pc, ui_pc, ui_bar,
2013+ ui_rate,ui_size,ui_time, ui_end)
2014
2015 self.fo.write(out)
2016 self.fo.flush()
2017@@ -274,7 +289,6 @@ class TextMeter(BaseMeter):
2018 global _text_meter_total_size
2019 global _text_meter_sofar_size
2020
2021- total_time = format_time(self.re.elapsed_time())
2022 total_size = format_number(amount_read)
2023 if self.text is not None:
2024 text = self.text
2025@@ -282,14 +296,13 @@ class TextMeter(BaseMeter):
2026 text = self.basename
2027
2028 tl = TerminalLine(8)
2029- ui_size = tl.add(' | %5sB' % total_size)
2030- ui_time = tl.add(' %9s' % total_time)
2031- not_done = self.size is not None and amount_read != self.size
2032- if not_done:
2033- ui_end = tl.add(' ... ')
2034+ if tl._llen > 80:
2035+ use_hours = True # For big screens, make it more readable.
2036 else:
2037- ui_end = tl.add(' ' * 5)
2038-
2039+ use_hours = False
2040+ ui_size = tl.add(' | %5sB' % total_size)
2041+ ui_time = tl.add(' %9s' % format_time(self.re.elapsed_time(),use_hours))
2042+ ui_end, not_done = _term_add_end(tl, self.size, amount_read)
2043 out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
2044 ui_size, ui_time, ui_end)
2045 self.fo.write(out)
2046@@ -331,12 +344,21 @@ class MultiFileHelper(BaseMeter):
2047 def message(self, message):
2048 self.master.message_meter(self, message)
2049
2050+class _FakeLock:
2051+ def acquire(self):
2052+ pass
2053+ def release(self):
2054+ pass
2055+
2056 class MultiFileMeter:
2057 helperclass = MultiFileHelper
2058- def __init__(self):
2059+ def __init__(self, threaded=True):
2060 self.meters = []
2061 self.in_progress_meters = []
2062- self._lock = thread.allocate_lock()
2063+ if threaded:
2064+ self._lock = thread.allocate_lock()
2065+ else:
2066+ self._lock = _FakeLock()
2067 self.update_period = 0.3 # seconds
2068
2069 self.numfiles = None
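Single-threaded callers (such as the _mtst() test harness added below) can
now pass threaded=False and skip real lock traffic via _FakeLock. A minimal
sketch of that usage, with illustrative names and sizes:

    tm = TextMultiFileMeter(threaded=False)
    tm.start(numfiles=2, total_size=3000)
    nm = tm.newMeter()
    nm.start('a.rpm', 'http://h.example.com/a.rpm', 'a.rpm', 1000,
             text='a.rpm')
    nm.update(500)
    nm.end(1000)
    tm.removeMeter(nm)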
2070@@ -369,6 +391,7 @@ class MultiFileMeter:
2071
2072 def end(self, now=None):
2073 if now is None: now = time.time()
2074+ self.re.update(self._amount_read(), now)
2075 self._do_end(now)
2076
2077 def _do_end(self, now):
2078@@ -407,8 +430,8 @@ class MultiFileMeter:
2079 def update_meter(self, meter, now):
2080 if not meter in self.meters:
2081 raise ValueError('attempt to use orphaned meter')
2082- if (now >= self.last_update_time + self.update_period) or \
2083- not self.last_update_time:
2084+ if (not self.last_update_time or
2085+ (now >= self.last_update_time + self.update_period)):
2086 self.re.update(self._amount_read(), now)
2087 self.last_update_time = now
2088 self._do_update_meter(meter, now)
2089@@ -466,34 +489,87 @@ class MultiFileMeter:
2090
2091
2092 class TextMultiFileMeter(MultiFileMeter):
2093- def __init__(self, fo=sys.stderr):
2094+ def __init__(self, fo=sys.stderr, threaded=True):
2095 self.fo = fo
2096- MultiFileMeter.__init__(self)
2097+ MultiFileMeter.__init__(self, threaded)
2098+ self.index_time = self.index = 0
2099
2100 # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
2101+# New output, like TextMeter output...
2102+# update: No size (minimal: 17 chars)
2103+# -----------------------------------
2104+# (<#file>/<#tot files>): <text> <rate> | <current size> <elapsed>
2105+# 8-48 1 8 3 6 1 7-9 5
2106+#
2107+# update: Size, All files
2108+# -----------------------
2109+# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
2110+# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1
2111+# end
2112+# ---
2113+# <text> | <file size> <file elapsed time>
2114+# 8-56 3 6 1 9 5
2115 def _do_update_meter(self, meter, now):
2116 self._lock.acquire()
2117 try:
2118- format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
2119- "time: %8.8s/%8.8s"
2120 df = self.finished_files
2121 tf = self.numfiles or 1
2122- pf = 100 * float(df)/tf + 0.49
2123+ # Don't use "percent of files complete" ...
2124+ # pf = 100 * float(df)/tf + 0.49
2125 dd = self.re.last_amount_read
2126- td = self.total_size
2127+ td = self.re.total
2128 pd = 100 * (self.re.fraction_read() or 0) + 0.49
2129 dt = self.re.elapsed_time()
2130 rt = self.re.remaining_time()
2131- if rt is None: tt = None
2132- else: tt = dt + rt
2133
2134- fdd = format_number(dd) + 'B'
2135- ftd = format_number(td) + 'B'
2136- fdt = format_time(dt, 1)
2137- ftt = format_time(tt, 1)
2138-
2139- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
2140- self.fo.write('\r' + out)
2141+ frac = self.re.fraction_read() or 0
2142+ pf = 100 * frac
2143+ ave_dl = format_number(self.re.average_rate())
2144+
2145+ # cycle through active meters
2146+ if now > self.index_time:
2147+ self.index_time = now + 1.0
2148+ self.index += 1
2149+ if self.index >= len(self.meters):
2150+ self.index = 0
2151+ meter = self.meters[self.index]
2152+ text = meter.text or meter.basename
2153+ if tf > 1:
2154+ text = '(%u/%u): %s' % (df+1+self.index, tf, text)
2155+
2156+ # Include text + ui_rate in minimal
2157+ tl = TerminalLine(8, 8+1+8)
2158+ if tl._llen > 80:
2159+ use_hours = True # For big screens, make it more readable.
2160+ time_len = 9
2161+ else:
2162+ use_hours = False
2163+ time_len = 7
2164+
2165+ ui_size = tl.add(' | %5sB' % format_number(dd))
2166+
2167+ if not self.re.total:
2168+ ui_time = tl.add(' %*s' % (time_len,format_time(dt, use_hours)))
2169+ ui_end = tl.add(' ' * 5)
2170+ ui_rate = tl.add(' %5sB/s' % ave_dl)
2171+ out = '\r%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
2172+ ui_rate, ui_size, ui_time, ui_end)
2173+ else:
2174+ ui_time = tl.add(' %*s' % (time_len,format_time(rt, use_hours)))
2175+ ui_end = tl.add(' ETA ')
2176+
2177+ ui_sofar_pc = tl.add(' %i%%' % pf,
2178+ full_len=len(" (100%)"))
2179+ ui_rate = tl.add(' %5sB/s' % ave_dl)
2180+
2181+ # Make text grow a bit before we start growing the bar too
2182+ blen = 4 + tl.rest_split(8 + 8 + 4)
2183+ ui_bar = _term_add_bar(tl, blen, frac)
2184+ out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
2185+ ui_sofar_pc, ui_bar,
2186+ ui_rate, ui_size, ui_time,
2187+ ui_end)
2188+ self.fo.write(out)
2189 self.fo.flush()
2190 finally:
2191 self._lock.release()
2192@@ -502,24 +578,40 @@ class TextMultiFileMeter(MultiFileMeter):
2193 self._lock.acquire()
2194 try:
2195 format = "%-30.30s %6.6s %8.8s %9.9s"
2196- fn = meter.basename
2197+ fn = meter.text or meter.basename
2198 size = meter.last_amount_read
2199 fsize = format_number(size) + 'B'
2200 et = meter.re.elapsed_time()
2201- fet = format_time(et, 1)
2202- frate = format_number(size / et) + 'B/s'
2203-
2204- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
2205- self.fo.write('\r' + out + '\n')
2206+ frate = format_number(et and size / et) + 'B/s'
2207+ df = self.finished_files
2208+ tf = self.numfiles or 1
2209+
2210+ total_size = format_number(size)
2211+ text = meter.text or meter.basename
2212+ if tf > 1:
2213+ text = '(%u/%u): %s' % (df, tf, text)
2214+
2215+ tl = TerminalLine(8)
2216+ if tl._llen > 80:
2217+ use_hours = True # For big screens, make it more readable.
2218+ time_len = 9
2219+ else:
2220+ use_hours = False
2221+ time_len = 7
2222+ ui_size = tl.add(' | %5sB' % total_size)
2223+ ui_time = tl.add(' %*s' % (time_len, format_time(et, use_hours)))
2224+ ui_end, not_done = _term_add_end(tl, meter.size, size)
2225+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
2226+ ui_size, ui_time, ui_end)
2227+ self.fo.write(out)
2228 finally:
2229 self._lock.release()
2230- self._do_update_meter(meter, now)
2231
2232 def _do_failure_meter(self, meter, message, now):
2233 self._lock.acquire()
2234 try:
2235 format = "%-30.30s %6.6s %s"
2236- fn = meter.basename
2237+ fn = meter.text or meter.basename
2238 if type(message) in (type(''), type(u'')):
2239 message = message.splitlines()
2240 if not message: message = ['']
2241@@ -536,15 +628,6 @@ class TextMultiFileMeter(MultiFileMeter):
2242 pass
2243 finally:
2244 self._lock.release()
2245-
2246- def _do_end(self, now):
2247- self._do_update_meter(None, now)
2248- self._lock.acquire()
2249- try:
2250- self.fo.write('\n')
2251- self.fo.flush()
2252- finally:
2253- self._lock.release()
2254
2255 ######################################################################
2256 # support classes and functions
2257@@ -658,6 +741,8 @@ def format_time(seconds, use_hours=0):
2258 if seconds is None or seconds < 0:
2259 if use_hours: return '--:--:--'
2260 else: return '--:--'
2261+ elif seconds == float('inf'):
2262+ return 'Infinite'
2263 else:
2264 seconds = int(seconds)
2265 minutes = seconds / 60
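With the new branch, an infinite remaining-time estimate (e.g. from a zero
measured rate) renders as a word instead of feeding inf into the minutes
arithmetic. Expected behaviour, for reference:

    print format_time(None)           # '--:--'
    print format_time(90)             # '01:30'
    print format_time(3723, 1)        # '01:02:03'
    print format_time(float('inf'))   # 'Infinite'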
2266@@ -722,9 +807,77 @@ def _tst(fn, cur, tot, beg, size, *args):
2267 time.sleep(delay)
2268 tm.end(size)
2269
2270+def _mtst(datas, *args):
2271+ print '-' * 79
2272+ tm = TextMultiFileMeter(threaded=False)
2273+
2274+ dl_sizes = {}
2275+
2276+ num = 0
2277+ total_size = 0
2278+ dl_total_size = 0
2279+ for data in datas:
2280+ dl_size = None
2281+ if len(data) == 2:
2282+ fn, size = data
2283+ dl_size = size
2284+ if len(data) == 3:
2285+ fn, size, dl_size = data
2286+ nm = tm.newMeter()
2287+ nm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size,
2288+ text=fn)
2289+ num += 1
2290+ assert dl_size is not None
2291+ dl_total_size += dl_size
2292+ dl_sizes[nm] = dl_size
2293+ if size is None or total_size is None:
2294+ total_size = None
2295+ else:
2296+ total_size += size
2297+ tm.start(num, total_size)
2298+
2299+ num = 0
2300+ off = 0
2301+ for (inc, delay) in args:
2302+ off += 1
2303+ while num < ((dl_total_size * off) / len(args)):
2304+ num += inc
2305+ for nm in tm.meters[:]:
2306+ if dl_sizes[nm] <= num:
2307+ nm.end(dl_sizes[nm])
2308+ tm.removeMeter(nm)
2309+ else:
2310+ nm.update(num)
2311+ time.sleep(delay)
2312+ assert not tm.meters
2313+
2314 if __name__ == "__main__":
2315 # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
2316 # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
2317+ if len(sys.argv) >= 2 and sys.argv[1] == 'multi':
2318+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
2319+ ("s-1.0.1-1.fc8.i386.rpm", 5000),
2320+ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
2321+ (100, 0.33), (500, 0.25), (1000, 0.1))
2322+
2323+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
2324+ ("s-1.0.1-1.fc8.i386.rpm", 5000),
2325+ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
2326+ (100, 0.33), (500, 0.25), (1000, 0.1))
2327+
2328+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
2329+ ("s-1.0.1-1.fc8.i386.rpm", 2500000),
2330+ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
2331+ (10, 0.2), (50, 0.1), (1000, 0.1))
2332+
2333+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
2334+ ("s-1.0.1-1.fc8.i386.rpm", None, 2500000),
2335+ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
2336+ (10, 0.2), (50, 0.1), (1000, 0.1))
2337+ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
2338+ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
2339+ sys.exit(0)
2340+
2341 if len(sys.argv) >= 2 and sys.argv[1] == 'total':
2342 text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
2343 1000000 + 10000 + 10000 + 10000 + 1000000)