+diff --git a/.gitignore b/.gitignore
+new file mode 100644
+index 0000000..1ffe416
+--- /dev/null
++++ b/.gitignore
+@@ -0,0 +1,7 @@
++*.py[co]
++MANIFEST
++dist
++build
++*.kdev*
++*.kateproject
++ipython.log*
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
def help_doc(self):
print __doc__
+diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
+new file mode 100755
+index 0000000..3dafb12
+--- /dev/null
++++ b/scripts/urlgrabber-ext-down
+@@ -0,0 +1,75 @@
++#! /usr/bin/python
++# A very simple external downloader
++# Copyright 2011-2012 Zdenek Pavlas
++
++# This library is free software; you can redistribute it and/or
++# modify it under the terms of the GNU Lesser General Public
++# License as published by the Free Software Foundation; either
++# version 2.1 of the License, or (at your option) any later version.
++#
++# This library is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++# Lesser General Public License for more details.
++#
++# You should have received a copy of the GNU Lesser General Public
++# License along with this library; if not, write to the
++# Free Software Foundation, Inc.,
++# 59 Temple Place, Suite 330,
++# Boston, MA 02111-1307 USA
++
++import time, os, errno, sys
++from urlgrabber.grabber import \
++ _readlines, URLGrabberOptions, _loads, \
++ PyCurlFileObject, URLGrabError
++
++def write(fmt, *arg):
++ try: os.write(1, fmt % arg)
++ except OSError, e:
++ if e.args[0] != errno.EPIPE: raise
++ sys.exit(1)
++
++class ProxyProgress:
++ def start(self, *d1, **d2):
++ self.next_update = 0
++ def update(self, _amount_read):
++ t = time.time()
++ if t < self.next_update: return
++ self.next_update = t + 0.31
++ write('%d %d\n', self._id, _amount_read)
++
++def main():
++ import signal
++ signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
++ cnt = 0
++ while True:
++ lines = _readlines(0)
++ if not lines: break
++ for line in lines:
++ cnt += 1
++ opts = URLGrabberOptions()
++ opts._id = cnt
++ for k in line.split(' '):
++ k, v = k.split('=', 1)
++ setattr(opts, k, _loads(v))
++ if opts.progress_obj:
++ opts.progress_obj = ProxyProgress()
++ opts.progress_obj._id = cnt
++
++ dlsz = dltm = 0
++ try:
++ fo = PyCurlFileObject(opts.url, opts.filename, opts)
++ fo._do_grab()
++ fo.fo.close()
++ size = fo._amount_read
++ if fo._tm_last:
++ dlsz = fo._tm_last[0] - fo._tm_first[0]
++ dltm = fo._tm_last[1] - fo._tm_first[1]
++ ug_err = 'OK'
++ except URLGrabError, e:
++ size = 0
++ ug_err = '%d %s' % e.args
++ write('%d %d %d %.3f %s\n', opts._id, size, dlsz, dltm, ug_err)
++
++if __name__ == '__main__':
++ main()
+diff --git a/setup.py b/setup.py
+index d0b87b8..bfa4a18 100644
+--- a/setup.py
++++ b/setup.py
+@@ -15,8 +15,10 @@ url = _urlgrabber.__url__
+ packages = ['urlgrabber']
+ package_dir = {'urlgrabber':'urlgrabber'}
+ scripts = ['scripts/urlgrabber']
+-data_files = [('share/doc/' + name + '-' + version,
+- ['README','LICENSE', 'TODO', 'ChangeLog'])]
++data_files = [
++ ('share/doc/' + name + '-' + version, ['README','LICENSE', 'TODO', 'ChangeLog']),
++ ('libexec', ['scripts/urlgrabber-ext-down']),
++]
+ options = { 'clean' : { 'all' : 1 } }
+ classifiers = [
+ 'Development Status :: 4 - Beta',
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
return (fb,lb)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
-index e090e90..b2770c5 100644
+index e090e90..6ce9861 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
-@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
+@@ -49,11 +49,26 @@ GENERAL ARGUMENTS (kwargs)
+ progress_obj = None
+
+ a class instance that supports the following methods:
+- po.start(filename, url, basename, length, text)
++ po.start(filename, url, basename, size, now, text)
+ # length will be None if unknown
+ po.update(read) # read == bytes read so far
+ po.end()
+
++ multi_progress_obj = None
++
++ a class instance that supports the following methods:
++ mo.start(total_files, total_size)
++ mo.newMeter() => meter
++ mo.removeMeter(meter)
++ mo.end()
++
++ The 'meter' object is similar to progress_obj, but multiple
++ instances may be created and updated at the same time.
++
++ When downloading multiple files in parallel and multi_progress_obj
++ is None progress_obj is used in compatibility mode: finished files
++ are shown but there's no in-progress display.
++
+ text = None
+
+ specifies alternative text to be passed to the progress meter
+@@ -68,14 +83,14 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
bandwidth = 0
-@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs)
+@@ -143,8 +158,12 @@ GENERAL ARGUMENTS (kwargs)
+ note that proxy authentication information may be provided using
+ normal URL constructs:
+ proxies={ 'http' : 'http://user:host@foo:3128' }
+- Lastly, if proxies is None, the default environment settings will
+- be used.
++
++ libproxy = False
++
++ Use the libproxy module (if installed) to find proxies.
++ The libproxy code is only used if the proxies dictionary
++ does not provide any proxies.
+
+ prefix = None
+
+@@ -198,6 +217,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via
the 'urlparser' option.
ssl_ca_cert = None
this option can be used if M2Crypto is available and will be
-@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs)
+@@ -211,43 +236,75 @@ GENERAL ARGUMENTS (kwargs)
+ No-op when using the curl backend (default)
+
+
+- self.ssl_verify_peer = True
++ ssl_verify_peer = True
+
+ Check the server's certificate to make sure it is valid with what our CA validates
+
+- self.ssl_verify_host = True
++ ssl_verify_host = True
+
+ Check the server's hostname to make sure it matches the certificate DN
+
+- self.ssl_key = None
++ ssl_key = None
+
+ Path to the key the client should use to connect/authenticate with
+
+- self.ssl_key_type = 'PEM'
++ ssl_key_type = 'PEM'
+
+ PEM or DER - format of key
+
+- self.ssl_cert = None
++ ssl_cert = None
+
+ Path to the ssl certificate the client should use to to authenticate with
+
+- self.ssl_cert_type = 'PEM'
++ ssl_cert_type = 'PEM'
+
+ PEM or DER - format of certificate
+
+- self.ssl_key_pass = None
++ ssl_key_pass = None
+
+ password to access the ssl_key
+
+- self.size = None
++ size = None
+
+ size (in bytes) or Maximum size of the thing being downloaded.
+ This is mostly to keep us from exploding with an endless datastream
+
+- self.max_header_size = 2097152
++ max_header_size = 2097152
Maximum size (in bytes) of the headers.
-+ self.ip_resolve = 'whatever'
++ ip_resolve = 'whatever'
+
+ What type of name to IP resolving to use, default is to do both IPV4 and
+ IPV6.
++
++ async = (key, limit)
++
++ When this option is set, the urlgrab() is not processed immediately
++ but queued. parallel_wait() then processes grabs in parallel, limiting
++ the numer of connections in each 'key' group to at most 'limit'.
++
++ max_connections
++
++ The global connection limit.
++
++ timedhosts
++
++ The filename of the host download statistics. If defined, urlgrabber
++ will update the stats at the end of every download. At the end of
++ parallel_wait(), the updated stats are saved. If synchronous grabs
++ are used, you should call th_save().
++
++ default_speed, half_life
++
++ These options only affect the async mirror selection code.
++ The default_speed option sets the speed estimate for mirrors
++ we have never downloaded from, and defaults to 1 MBps.
++
++ The speed estimate also drifts exponentially from the speed
++ actually measured to the default speed, with default
++ period of 30 days.
+
RETRY RELATED ARGUMENTS
-@@ -420,6 +431,7 @@ import time
+@@ -328,6 +385,15 @@ RETRY RELATED ARGUMENTS
+ but it cannot (without severe trickiness) prevent the exception
+ from being raised.
+
++ failfunc = None
++
++ The callback that gets called when urlgrab request fails.
++ If defined, urlgrab() calls it instead of raising URLGrabError.
++ Callback syntax is identical to failure_callback.
++
++ Contrary to failure_callback, it's called only once. It's primary
++ purpose is to use urlgrab() without a try/except block.
++
+ interrupt_callback = None
+
+ This callback is called if KeyboardInterrupt is received at any
+@@ -420,6 +486,7 @@ import time
import string
import urllib
import urllib2
import mimetools
import thread
import types
-@@ -439,6 +451,12 @@ try:
+@@ -428,9 +495,17 @@ import pycurl
+ from ftplib import parse150
+ from StringIO import StringIO
+ from httplib import HTTPException
+-import socket
++import socket, select, fcntl
+ from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+
++try:
++ import xattr
++ if not hasattr(xattr, 'set'):
++ xattr = None # This is a "newer" API.
++except ImportError:
++ xattr = None
++
++
+ ########################################################################
+ # MODULE INITIALIZATION
+ ########################################################################
+@@ -439,6 +514,12 @@ try:
except:
__version__ = '???'
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
-@@ -527,6 +545,22 @@ def _(st):
+@@ -504,6 +585,7 @@ def _init_default_logger(logspec=None):
+ else: handler = logging.FileHandler(filename)
+ handler.setFormatter(formatter)
+ DBOBJ = logging.getLogger('urlgrabber')
++ DBOBJ.propagate = False
+ DBOBJ.addHandler(handler)
+ DBOBJ.setLevel(level)
+ except (KeyError, ImportError, ValueError):
+@@ -512,8 +594,8 @@ def _init_default_logger(logspec=None):
+
+ def _log_package_state():
+ if not DEBUG: return
+- DEBUG.info('urlgrabber version = %s' % __version__)
+- DEBUG.info('trans function "_" = %s' % _)
++ DEBUG.debug('urlgrabber version = %s' % __version__)
++ DEBUG.debug('trans function "_" = %s' % _)
+
+ _init_default_logger()
+ _log_package_state()
+@@ -527,6 +609,29 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
+ obj = obj.encode('utf-8', errors)
+ return obj
+
++def exception2msg(e):
++ try:
++ return str(e)
++ except UnicodeEncodeError:
++ # always use byte strings
++ return unicode(e).encode('utf8')
++
+########################################################################
+# END UTILITY FUNCTIONS
+########################################################################
class URLGrabError(IOError):
-@@ -662,6 +696,7 @@ class URLParser:
+@@ -662,6 +767,7 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
quote = opts.quote
if opts.prefix:
-@@ -800,6 +835,7 @@ class URLGrabberOptions:
+@@ -768,6 +874,41 @@ class URLGrabberOptions:
+ else: # throttle is a float
+ return self.bandwidth * self.throttle
+
++ def find_proxy(self, url, scheme):
++ """Find the proxy to use for this URL.
++ Use the proxies dictionary first, then libproxy.
++ """
++ self.proxy = None
++ if scheme not in ('ftp', 'http', 'https'):
++ return
++
++ if self.proxies:
++ proxy = self.proxies.get(scheme)
++ if proxy is None:
++ if scheme == 'http':
++ proxy = self.proxies.get('https')
++ elif scheme == 'https':
++ proxy = self.proxies.get('http')
++ if proxy == '_none_':
++ proxy = ''
++ self.proxy = proxy
++ return
++
++ if self.libproxy:
++ global _libproxy_cache
++ if _libproxy_cache is None:
++ try:
++ import libproxy
++ _libproxy_cache = libproxy.ProxyFactory()
++ except:
++ _libproxy_cache = False
++ if _libproxy_cache:
++ for proxy in _libproxy_cache.getProxies(url):
++ if proxy.startswith('http://'):
++ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
++ self.proxy = proxy
++ break
++
+ def derive(self, **kwargs):
+ """Create a derived URLGrabberOptions instance.
+ This method creates a new instance and overrides the
+@@ -791,30 +932,37 @@ class URLGrabberOptions:
+ provided here.
+ """
+ self.progress_obj = None
++ self.multi_progress_obj = None
+ self.throttle = 1.0
+ self.bandwidth = 0
+ self.retry = None
+ self.retrycodes = [-1,2,4,5,6,7]
+ self.checkfunc = None
++ self.failfunc = _do_raise
+ self.copy_local = 0
self.close_connection = 0
self.range = None
self.user_agent = 'urlgrabber/%s' % __version__
+ self.ip_resolve = None
self.keepalive = 1
self.proxies = None
++ self.libproxy = False
++ self.proxy = None
self.reget = None
-@@ -808,13 +844,15 @@ class URLGrabberOptions:
+ self.failure_callback = None
+ self.interrupt_callback = None
self.prefix = None
self.opener = None
self.cache_openers = True
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
self.ssl_verify_peer = True # check peer's cert for authenticityb
-@@ -846,7 +884,7 @@ class URLGrabberOptions:
+@@ -827,6 +975,12 @@ class URLGrabberOptions:
+ self.size = None # if we know how big the thing we're getting is going
+ # to be. this is ultimately a MAXIMUM size for the file
+ self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
++ self.async = None # blocking by default
++ self.mirror_group = None
++ self.max_connections = 5
++ self.timedhosts = None
++ self.half_life = 30*24*60*60 # 30 days
++ self.default_speed = 1e6 # 1 MBit
+
+ def __repr__(self):
+ return self.format()
+@@ -846,7 +1000,18 @@ class URLGrabberOptions:
s = s + indent + '}'
return s
-class URLGrabber:
++def _do_raise(obj):
++ raise obj.exception
++
++def _run_callback(cb, obj):
++ if not cb:
++ return
++ if callable(cb):
++ return cb(obj)
++ cb, arg, karg = cb
++ return cb(obj, *arg, **karg)
++
+class URLGrabber(object):
"""Provides easy opening of URLs with a variety of options.
All options are specified as kwargs. Options may be specified when
-@@ -931,6 +969,9 @@ class URLGrabber:
+@@ -872,7 +1037,6 @@ class URLGrabber:
+ # beware of infinite loops :)
+ tries = tries + 1
+ exception = None
+- retrycode = None
+ callback = None
+ if DEBUG: DEBUG.info('attempt %i/%s: %s',
+ tries, opts.retry, args[0])
+@@ -883,54 +1047,62 @@ class URLGrabber:
+ except URLGrabError, e:
+ exception = e
+ callback = opts.failure_callback
+- retrycode = e.errno
+ except KeyboardInterrupt, e:
+ exception = e
+ callback = opts.interrupt_callback
++ if not callback:
++ raise
+
+ if DEBUG: DEBUG.info('exception: %s', exception)
+ if callback:
+ if DEBUG: DEBUG.info('calling callback: %s', callback)
+- cb_func, cb_args, cb_kwargs = self._make_callback(callback)
+ obj = CallbackObject(exception=exception, url=args[0],
+ tries=tries, retry=opts.retry)
+- cb_func(obj, *cb_args, **cb_kwargs)
++ _run_callback(callback, obj)
+
+ if (opts.retry is None) or (tries == opts.retry):
+ if DEBUG: DEBUG.info('retries exceeded, re-raising')
+ raise
+
++ retrycode = getattr(exception, 'errno', None)
+ if (retrycode is not None) and (retrycode not in opts.retrycodes):
+ if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
+ retrycode, opts.retrycodes)
+ raise
+
+- def urlopen(self, url, **kwargs):
++ def urlopen(self, url, opts=None, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle value specified when this
+ object was created, then a special file object will be
+ returned that supports them. The file object can be treated
+ like any other file object.
+ """
+- opts = self.opts.derive(**kwargs)
++ url = _to_utf8(url)
++ opts = (opts or self.opts).derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
++ opts.find_proxy(url, parts[0])
+ def retryfunc(opts, url):
+ return PyCurlFileObject(url, filename=None, opts=opts)
+ return self._retry(opts, retryfunc, url)
+
+- def urlgrab(self, url, filename=None, **kwargs):
++ def urlgrab(self, url, filename=None, opts=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+ If filename is none, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be
+ different from the passed-in filename if copy_local == 0.
+ """
+- opts = self.opts.derive(**kwargs)
++ url = _to_utf8(url)
++ opts = (opts or self.opts).derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
++ opts.find_proxy(url, scheme)
if filename is None:
filename = os.path.basename( urllib.unquote(path) )
+ if not filename:
if scheme == 'file' and not opts.copy_local:
# just return the name of the local file - don't make a
# copy currently
-@@ -1030,7 +1071,7 @@ class URLGrabber:
+@@ -950,41 +1122,51 @@ class URLGrabber:
+
+ elif not opts.range:
+ if not opts.checkfunc is None:
+- cb_func, cb_args, cb_kwargs = \
+- self._make_callback(opts.checkfunc)
+- obj = CallbackObject()
+- obj.filename = path
+- obj.url = url
+- apply(cb_func, (obj, )+cb_args, cb_kwargs)
++ obj = CallbackObject(filename=path, url=url)
++ _run_callback(opts.checkfunc, obj)
+ return path
+
++ if opts.async:
++ opts.url = url
++ opts.filename = filename
++ opts.size = int(opts.size or 0)
++ _async_queue.append(opts)
++ return filename
++
+ def retryfunc(opts, url, filename):
+ fo = PyCurlFileObject(url, filename, opts)
+ try:
+ fo._do_grab()
++ if fo._tm_last:
++ dlsz = fo._tm_last[0] - fo._tm_first[0]
++ dltm = fo._tm_last[1] - fo._tm_first[1]
++ _TH.update(url, dlsz, dltm, None)
+ if not opts.checkfunc is None:
+- cb_func, cb_args, cb_kwargs = \
+- self._make_callback(opts.checkfunc)
+- obj = CallbackObject()
+- obj.filename = filename
+- obj.url = url
+- apply(cb_func, (obj, )+cb_args, cb_kwargs)
++ obj = CallbackObject(filename=filename, url=url)
++ _run_callback(opts.checkfunc, obj)
+ finally:
+ fo.close()
+ return filename
+
+- return self._retry(opts, retryfunc, url, filename)
++ try:
++ return self._retry(opts, retryfunc, url, filename)
++ except URLGrabError, e:
++ _TH.update(url, 0, 0, e)
++ opts.exception = e
++ return _run_callback(opts.failfunc, opts)
+
+- def urlread(self, url, limit=None, **kwargs):
++ def urlread(self, url, limit=None, opts=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note
+ that urlread is NOT intended to be used as a way of saying
+ "I want the first N bytes" but rather 'read the whole file
+ into memory, but don't use too much'
+ """
+- opts = self.opts.derive(**kwargs)
++ url = _to_utf8(url)
++ opts = (opts or self.opts).derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
++ opts.find_proxy(url, parts[0])
+ if limit is not None:
+ limit = limit + 1
+
+@@ -1000,12 +1182,8 @@ class URLGrabber:
+ else: s = fo.read(limit)
+
+ if not opts.checkfunc is None:
+- cb_func, cb_args, cb_kwargs = \
+- self._make_callback(opts.checkfunc)
+- obj = CallbackObject()
+- obj.data = s
+- obj.url = url
+- apply(cb_func, (obj, )+cb_args, cb_kwargs)
++ obj = CallbackObject(data=s, url=url)
++ _run_callback(opts.checkfunc, obj)
+ finally:
+ fo.close()
+ return s
+@@ -1020,6 +1198,7 @@ class URLGrabber:
+ return s
+
+ def _make_callback(self, callback_obj):
++ # not used, left for compatibility
+ if callable(callback_obj):
+ return callback_obj, (), {}
+ else:
+@@ -1030,7 +1209,7 @@ class URLGrabber:
default_grabber = URLGrabber()
def __init__(self, url, filename, opts):
self.fo = None
self._hdr_dump = ''
-@@ -1052,9 +1093,15 @@ class PyCurlFileObject():
+@@ -1052,10 +1231,13 @@ class PyCurlFileObject():
self._reget_length = 0
self._prog_running = False
self._error = (None, None)
- self.size = None
+ self.size = 0
+ self._hdr_ended = False
++ self._tm_first = None
++ self._tm_last = None
self._do_open()
+-
+
-+ def geturl(self):
-+ """ Provide the geturl() method, used to be got from
-+ urllib.addinfourl, via. urllib.URLopener.* """
-+ return self.url
-
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
-@@ -1085,9 +1132,14 @@ class PyCurlFileObject():
+ Any attribute not found in _this_ object will be searched for
+@@ -1067,6 +1249,12 @@ class PyCurlFileObject():
+
+ def _retrieve(self, buf):
+ try:
++ tm = self._amount_read + len(buf), time.time()
++ if self._tm_first is None:
++ self._tm_first = tm
++ else:
++ self._tm_last = tm
++
+ if not self._prog_running:
+ if self.opts.progress_obj:
+ size = self.size + self._reget_length
+@@ -1079,15 +1267,24 @@ class PyCurlFileObject():
+ self.opts.progress_obj.update(self._amount_read)
+
+ self._amount_read += len(buf)
+- self.fo.write(buf)
++ try:
++ self.fo.write(buf)
++ except IOError, e:
++ self._cb_error = URLGrabError(16, exception2msg(e))
++ return -1
+ return len(buf)
+ except KeyboardInterrupt:
return -1
def _hdr_retrieve(self, buf):
try:
self._hdr_dump += buf
# we have to get the size before we do the progress obj start
-@@ -1104,7 +1156,17 @@ class PyCurlFileObject():
+@@ -1104,7 +1301,17 @@ class PyCurlFileObject():
s = parse150(buf)
if s:
self.size = int(s)
+
+ if len(self._hdr_dump) != 0 and buf == '\r\n':
+ self._hdr_ended = True
-+ if DEBUG: DEBUG.info('header ended:')
++ if DEBUG: DEBUG.debug('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
-@@ -1113,8 +1175,10 @@ class PyCurlFileObject():
+@@ -1113,8 +1320,10 @@ class PyCurlFileObject():
if self._parsed_hdr:
return self._parsed_hdr
statusend = self._hdr_dump.find('\n')
self._parsed_hdr = mimetools.Message(hdrfp)
return self._parsed_hdr
-@@ -1136,11 +1200,21 @@ class PyCurlFileObject():
+@@ -1127,6 +1336,9 @@ class PyCurlFileObject():
+ if not opts:
+ opts = self.opts
+
++ # keepalives
++ if not opts.keepalive:
++ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
+
+ # defaults we're always going to set
+ self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+@@ -1136,11 +1348,21 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
- if DEBUG:
+- if DEBUG:
++ if DEBUG and DEBUG.level <= 10:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
-@@ -1148,9 +1222,11 @@ class PyCurlFileObject():
+@@ -1148,9 +1370,11 @@ class PyCurlFileObject():
# timeouts
timeout = 300
+ if hasattr(opts, 'timeout'):
+ timeout = int(opts.timeout or 0)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
++ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1000)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
if self.scheme == 'https':
-@@ -1203,12 +1279,19 @@ class PyCurlFileObject():
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
+@@ -1158,13 +1382,16 @@ class PyCurlFileObject():
+ self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
+ self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
+ self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
+- self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
++ if opts.ssl_verify_host: # 1 is meaningless to curl
++ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
+ if opts.ssl_key:
+ self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
+ if opts.ssl_key_type:
+ self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
+ if opts.ssl_cert:
+ self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
++ # if we have a client side cert - turn off reuse b/c nss is odd
++ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
+ if opts.ssl_cert_type:
+ self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
+ if opts.ssl_key_pass:
+@@ -1187,28 +1414,26 @@ class PyCurlFileObject():
+ if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
+ self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
+- # proxy settings
+- if opts.proxies:
+- for (scheme, proxy) in opts.proxies.items():
+- if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
+- if scheme not in ('ftp'):
+- continue
+- else:
+- if proxy == '_none_': proxy = ""
+- self.curl_obj.setopt(pycurl.PROXY, proxy)
+- elif self.scheme in ('http', 'https'):
+- if scheme not in ('http', 'https'):
+- continue
+- else:
+- if proxy == '_none_': proxy = ""
+- self.curl_obj.setopt(pycurl.PROXY, proxy)
+-
- # FIXME username/password/auth settings
++ # proxy
++ if opts.proxy is not None:
++ self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
++ self.curl_obj.setopt(pycurl.PROXYAUTH,
++ # All but Kerberos. BZ 769254
++ pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)
++
+ if opts.username and opts.password:
+ if self.scheme in ('http', 'https'):
+ self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
-@@ -1228,12 +1311,14 @@ class PyCurlFileObject():
+@@ -1228,39 +1453,36 @@ class PyCurlFileObject():
code = self.http_code
errcode = e.args[0]
if self._error[0]:
errcode = self._error[0]
- if errcode == 23 and code >= 200 and code < 299:
+- if errcode == 23 and code >= 200 and code < 299:
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
- err.url = self.url
-+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
-+ err.url = errurl
-
+-
++ if errcode == 23 and 200 <= code <= 299:
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
-@@ -1244,23 +1329,23 @@ class PyCurlFileObject():
- raise KeyboardInterrupt
+ # which almost always means something aborted it from outside
+ # since we cannot know what it is -I'm banking on it being
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # figure out what aborted the pycurl process FIXME
+- raise KeyboardInterrupt
++ raise getattr(self, '_cb_error', KeyboardInterrupt)
elif errcode == 28:
- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
elif errcode == 42:
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
- err.url = self.url
-+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
-+ err.url = errurl
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
-@@ -1272,33 +1357,93 @@ class PyCurlFileObject():
+@@ -1272,33 +1494,94 @@ class PyCurlFileObject():
elif errcode == 58:
msg = _("problem with the local client certificate")
err = URLGrabError(14, msg)
+ err.url = errurl
raise err
- elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
+- elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
++ elif str(e.args[1]) == '' and code and not 200 <= code <= 299:
+ if self.scheme in ['http', 'https']:
+ if self.http_code in responses:
+ resp = responses[self.http_code]
+ 42 : _("Aborted by callback"),
+ 47 : _("Too many redirects"),
+ 51 : _("Peer certificate failed verification"),
++ 52 : _("Got nothing: SSL certificate expired?"),
+ 53 : _("SSL engine not found"),
+ 54 : _("SSL engine set failed"),
+ 55 : _("Network error send()"),
+ else:
+ if self._error[1]:
+ msg = self._error[1]
-+ err = URLGRabError(14, msg)
++ err = URLGrabError(14, msg)
+ err.url = urllib.unquote(self.url)
+ raise err
def _do_open(self):
self.curl_obj = _curl_cache
-@@ -1333,7 +1478,11 @@ class PyCurlFileObject():
+@@ -1333,7 +1616,11 @@ class PyCurlFileObject():
if self.opts.range:
rt = self.opts.range
if rt:
header = range_tuple_to_header(rt)
-@@ -1434,9 +1583,13 @@ class PyCurlFileObject():
+@@ -1434,21 +1721,46 @@ class PyCurlFileObject():
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
-
- self._do_perform()
-
+-
+-
+ try:
+ self._do_perform()
+ except URLGrabError, e:
+ self.fo.close()
+ raise e
+
-
-
if _was_filename:
-@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
+ # close it up
+ self.fo.flush()
+ self.fo.close()
++
++ # Set the URL where we got it from:
++ if xattr is not None:
++ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
++ try:
++ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
++ except:
++ pass # URL too long. = IOError ... ignore everything.
++
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
-@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
+@@ -1526,17 +1838,20 @@ class PyCurlFileObject():
+ if self._prog_running:
+ downloaded += self._reget_length
+ self.opts.progress_obj.update(downloaded)
+- except KeyboardInterrupt:
++ except (KeyboardInterrupt, IOError):
+ return -1
+
def _over_max_size(self, cur, max_size=None):
if not max_size:
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
-@@ -1544,13 +1714,6 @@ class PyCurlFileObject():
+@@ -1544,13 +1859,6 @@ class PyCurlFileObject():
return True
return False
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
-@@ -1582,9 +1745,21 @@ class PyCurlFileObject():
+@@ -1582,9 +1890,21 @@ class PyCurlFileObject():
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
+
-+
++_libproxy_cache = None
+
#####################################################################
# DEPRECATED FUNCTIONS
+@@ -1621,6 +1941,478 @@ def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+
+
+ #####################################################################
++# Serializer + parser: A replacement of the rather bulky Json code.
++#
++# - handles basic python literals, lists and tuples.
++# - serialized strings never contain ' ' or '\n'
++#
++#####################################################################
++
++_quoter_map = {}
++for c in '%[(,)] \n':
++ _quoter_map[c] = '%%%02x' % ord(c)
++del c
++
++def _dumps(v):
++ if v is None: return 'None'
++ if v is True: return 'True'
++ if v is False: return 'False'
++ if type(v) in (int, long, float):
++ return str(v)
++ if type(v) == unicode:
++ v = v.encode('UTF8')
++ if type(v) == str:
++ def quoter(c): return _quoter_map.get(c, c)
++ return "'%s'" % ''.join(map(quoter, v))
++ if type(v) == tuple:
++ return "(%s)" % ','.join(map(_dumps, v))
++ if type(v) == list:
++ return "[%s]" % ','.join(map(_dumps, v))
++ raise TypeError, 'Can\'t serialize %s' % v
++
++def _loads(s):
++ def decode(v):
++ if v == 'None': return None
++ if v == 'True': return True
++ if v == 'False': return False
++ try: return int(v)
++ except ValueError: pass
++ try: return float(v)
++ except ValueError: pass
++ if len(v) >= 2 and v[0] == v[-1] == "'":
++ ret = []; i = 1
++ while True:
++ j = v.find('%', i)
++ ret.append(v[i:j]) # skips the final "'"
++ if j == -1: break
++ ret.append(chr(int(v[j + 1:j + 3], 16)))
++ i = j + 3
++ v = ''.join(ret)
++ return v
++ stk = None
++ l = []
++ i = j = 0
++ while True:
++ if j == len(s) or s[j] in ',)]':
++ if j > i:
++ l.append(decode(s[i:j]))
++ if j == len(s): break
++ if s[j] in ')]':
++ if s[j] == ')':
++ l = tuple(l)
++ stk[0].append(l)
++ l, stk = stk
++ i = j = j + 1
++ elif s[j] in '[(':
++ stk = l, stk
++ l = []
++ i = j = j + 1
++ else:
++ j += 1 # safe because '[(,)]' are quoted
++ if stk: raise ValueError
++ if len(l) == 1: l = l[0]
++ return l
++
++
++#####################################################################
++# External downloader process
++#####################################################################
++
++def _readlines(fd):
++ buf = os.read(fd, 4096)
++ if not buf: return None
++ # whole lines only, no buffering
++ while buf[-1] != '\n':
++ buf += os.read(fd, 4096)
++ return buf[:-1].split('\n')
++
++import subprocess
++
++class _ExternalDownloader:
++ def __init__(self):
++ self.popen = subprocess.Popen(
++ '/usr/libexec/urlgrabber-ext-down',
++ stdin = subprocess.PIPE,
++ stdout = subprocess.PIPE,
++ )
++ self.stdin = self.popen.stdin.fileno()
++ self.stdout = self.popen.stdout.fileno()
++ self.running = {}
++ self.cnt = 0
++
++ # list of options we pass to downloader
++ _options = (
++ 'url', 'filename',
++ 'timeout', 'close_connection', 'keepalive',
++ 'throttle', 'bandwidth', 'range', 'reget',
++ 'user_agent', 'http_headers', 'ftp_headers',
++ 'proxy', 'prefix', 'username', 'password',
++ 'ssl_ca_cert',
++ 'ssl_cert', 'ssl_cert_type',
++ 'ssl_key', 'ssl_key_type',
++ 'ssl_key_pass',
++ 'ssl_verify_peer', 'ssl_verify_host',
++ 'size', 'max_header_size', 'ip_resolve',
++ )
++
++ def start(self, opts):
++ arg = []
++ for k in self._options:
++ v = getattr(opts, k)
++ if v is None: continue
++ arg.append('%s=%s' % (k, _dumps(v)))
++ if opts.progress_obj and opts.multi_progress_obj:
++ arg.append('progress_obj=True')
++ arg = ' '.join(arg)
++ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
++
++ self.cnt += 1
++ self.running[self.cnt] = opts
++ os.write(self.stdin, arg +'\n')
++
++ def perform(self):
++ ret = []
++ lines = _readlines(self.stdout)
++ if not lines:
++ if DEBUG: DEBUG.info('downloader died')
++ raise KeyboardInterrupt
++ for line in lines:
++ # parse downloader output
++ line = line.split(' ', 5)
++ _id, size = map(int, line[:2])
++ if len(line) == 2:
++ self.running[_id]._progress.update(size)
++ continue
++ # job done
++ opts = self.running.pop(_id)
++ if line[4] == 'OK':
++ ug_err = None
++ if DEBUG: DEBUG.info('success')
++ else:
++ ug_err = URLGrabError(int(line[4]), line[5])
++ if DEBUG: DEBUG.info('failure: %s', ug_err)
++ _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async[0])
++ ret.append((opts, size, ug_err))
++ return ret
++
++ def abort(self):
++ self.popen.stdin.close()
++ self.popen.stdout.close()
++ self.popen.wait()
++
++class _ExternalDownloaderPool:
++ def __init__(self):
++ self.epoll = select.epoll()
++ self.running = {}
++ self.cache = {}
++
++ def start(self, opts):
++ host = urlparse.urlsplit(opts.url).netloc
++ dl = self.cache.pop(host, None)
++ if not dl:
++ dl = _ExternalDownloader()
++ fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD)
++ fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC)
++ self.epoll.register(dl.stdout, select.EPOLLIN)
++ self.running[dl.stdout] = dl
++ dl.start(opts)
++
++ def perform(self):
++ ret = []
++ for fd, event in self.epoll.poll():
++ if event & select.EPOLLHUP:
++ if DEBUG: DEBUG.info('downloader died')
++ raise KeyboardInterrupt
++ assert event & select.EPOLLIN
++ done = self.running[fd].perform()
++ if not done: continue
++ assert len(done) == 1
++ ret.extend(done)
++
++ # dl finished, move it to the cache
++ host = urlparse.urlsplit(done[0][0].url).netloc
++ if host in self.cache: self.cache[host].abort()
++ self.epoll.unregister(fd)
++ self.cache[host] = self.running.pop(fd)
++ return ret
++
++ def abort(self):
++ for dl in self.running.values():
++ self.epoll.unregister(dl.stdout)
++ dl.abort()
++ for dl in self.cache.values():
++ dl.abort()
++
++
++#####################################################################
++# High level async API
++#####################################################################
++
++_async_queue = []
++
++def parallel_wait(meter=None):
++ '''Process queued requests in parallel.
++ '''
++
++ # calculate total sizes
++ meters = {}
++ for opts in _async_queue:
++ if opts.progress_obj and opts.multi_progress_obj:
++ count, total = meters.get(opts.multi_progress_obj) or (0, 0)
++ meters[opts.multi_progress_obj] = count + 1, total + opts.size
++
++ # start multi-file meters
++ for meter in meters:
++ count, total = meters[meter]
++ meter.start(count, total)
++
++ dl = _ExternalDownloaderPool()
++ host_con = {} # current host connection counts
++
++ def start(opts, tries):
++ opts.tries = tries
++ try:
++ dl.start(opts)
++ except OSError, e:
++ # can't spawn downloader, give up immediately
++ opts.exception = URLGrabError(5, exception2msg(e))
++ _run_callback(opts.failfunc, opts)
++ return
++
++ key, limit = opts.async
++ host_con[key] = host_con.get(key, 0) + 1
++ if opts.progress_obj:
++ if opts.multi_progress_obj:
++ opts._progress = opts.multi_progress_obj.newMeter()
++ opts._progress.start(text=opts.text)
++ else:
++ opts._progress = time.time() # no updates
++
++ def perform():
++ for opts, size, ug_err in dl.perform():
++ key, limit = opts.async
++ host_con[key] -= 1
++
++ if ug_err is None:
++ if opts.checkfunc:
++ try: _run_callback(opts.checkfunc, opts)
++ except URLGrabError, ug_err: pass
++
++ if opts.progress_obj:
++ if opts.multi_progress_obj:
++ if ug_err:
++ opts._progress.failure(None)
++ else:
++ opts.multi_progress_obj.re.total += size - opts.size # correct totals
++ opts._progress.end(size)
++ opts.multi_progress_obj.removeMeter(opts._progress)
++ else:
++ opts.progress_obj.start(text=opts.text, now=opts._progress)
++ opts.progress_obj.update(size)
++ opts.progress_obj.end(size)
++ del opts._progress
++
++ if ug_err is None:
++ continue
++
++ retry = opts.retry or 0
++ if opts.failure_callback:
++ opts.exception = ug_err
++ try: _run_callback(opts.failure_callback, opts)
++ except URLGrabError, ug_err:
++ retry = 0 # no retries
++ if opts.tries < retry and ug_err.errno in opts.retrycodes:
++ start(opts, opts.tries + 1) # simple retry
++ continue
++
++ if opts.mirror_group:
++ mg, errors, failed, removed = opts.mirror_group
++ errors.append((opts.url, exception2msg(ug_err)))
++ failed[key] = failed.get(key, 0) + 1
++ opts.mirror = key
++ opts.exception = ug_err
++ action = mg.default_action or {}
++ if mg.failure_callback:
++ opts.tries = len(errors)
++ action = dict(action) # update only the copy
++ action.update(_run_callback(mg.failure_callback, opts))
++ if not action.get('fail', 0):
++ # mask this mirror and retry
++ if action.get('remove', 1):
++ removed.add(key)
++ _async_queue.append(opts)
++ continue
++ # fail=1 from callback
++ ug_err.errors = errors
++
++ # urlgrab failed
++ opts.exception = ug_err
++ _run_callback(opts.failfunc, opts)
++
++ try:
++ idx = 0
++ while True:
++ if idx >= len(_async_queue):
++ # the queue is empty
++ if not dl.running: break
++ # pending dl may extend it
++ perform()
++ continue
++
++ # handle next request
++ opts = _async_queue[idx]
++ idx += 1
++
++ # check global limit
++ while len(dl.running) >= default_grabber.opts.max_connections:
++ perform()
++ if DEBUG:
++ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
++
++ if opts.mirror_group:
++ mg, errors, failed, removed = opts.mirror_group
++
++ # find the best mirror
++ best = None
++ best_speed = None
++ for mirror in mg.mirrors:
++ key = mirror['mirror']
++ if key in removed: continue
++
++ # estimate mirror speed
++ speed, fail = _TH.estimate(key)
++ speed /= 1 + host_con.get(key, 0)
++
++ # order by: least failures, private flag, best speed
++ # ignore 'private' flag if there were failures
++ private = not fail and mirror.get('kwargs', {}).get('private', False)
++ speed = -failed.get(key, 0), private, speed
++ if best is None or speed > best_speed:
++ best = mirror
++ best_speed = speed
++
++ if best is None:
++ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
++ opts.exception.errors = errors
++ _run_callback(opts.failfunc, opts)
++ continue
++
++ # update the grabber object, apply mirror kwargs
++ grabber = best.get('grabber') or mg.grabber
++ opts.delegate = grabber.opts.derive(**best.get('kwargs', {}))
++
++ # update the current mirror and limit
++ key = best['mirror']
++ limit = best.get('kwargs', {}).get('max_connections', 2)
++ opts.async = key, limit
++
++ # update URL and proxy
++ url = mg._join_url(key, opts.relative_url)
++ url, parts = opts.urlparser.parse(url, opts)
++ opts.find_proxy(url, parts[0])
++ opts.url = url
++
++ # check host limit, then start
++ key, limit = opts.async
++ while host_con.get(key, 0) >= limit:
++ perform()
++ if DEBUG:
++ DEBUG.info('max_connections(%s): %d/%d', key, host_con.get(key, 0), limit)
++
++ start(opts, 1)
++ except IOError, e:
++ if e.errno != 4: raise
++ raise KeyboardInterrupt
++
++ finally:
++ dl.abort()
++ for meter in meters:
++ meter.end()
++ del _async_queue[:]
++ _TH.save()
++
++
++#####################################################################
++# Host bandwidth estimation
++#####################################################################
++
++class _TH:
++ hosts = {}
++ dirty = None
++
++ @staticmethod
++ def load():
++ filename = default_grabber.opts.timedhosts
++ if filename and _TH.dirty is None:
++ try:
++ for line in open(filename):
++ host, speed, fail, ts = line.rsplit(' ', 3)
++ _TH.hosts[host] = int(speed), int(fail), int(ts)
++ except IOError: pass
++ _TH.dirty = False
++
++ @staticmethod
++ def save():
++ filename = default_grabber.opts.timedhosts
++ if filename and _TH.dirty is True:
++ tmp = '%s.%d' % (filename, os.getpid())
++ try:
++ f = open(tmp, 'w')
++ for host in _TH.hosts:
++ f.write(host + ' %d %d %d\n' % _TH.hosts[host])
++ f.close()
++ os.rename(tmp, filename)
++ except IOError: pass
++ _TH.dirty = False
++
++ @staticmethod
++ def update(url, dl_size, dl_time, ug_err, baseurl=None):
++ # Use hostname from URL. If it's a file:// URL, use baseurl.
++ # If no baseurl, do not update timedhosts.
++ host = urlparse.urlsplit(url).netloc.split('@')[-1] or baseurl
++ if not host: return
++
++ _TH.load()
++ speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0)
++ now = time.time()
++
++ if ug_err is None:
++ # defer first update if the file was small. BZ 851178.
++ if not ts and dl_size < 1e6: return
++
++ # k1: the older, the less useful
++ # k2: <500ms readings are less reliable
++ # speeds vary, use 10:1 smoothing
++ k1 = 2**((ts - now) / default_grabber.opts.half_life)
++ k2 = min(dl_time / .500, 1.0) / 10
++ if k2 > 0:
++ speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
++ fail = 0
++ elif getattr(ug_err, 'code', None) == 404:
++ fail = 0 # alive, at least
++ else:
++ fail += 1 # seems dead
++
++ _TH.hosts[host] = speed, fail, now
++ _TH.dirty = True
++
++ @staticmethod
++ def estimate(baseurl):
++ _TH.load()
++
++ # Use just the hostname, unless it's a file:// baseurl.
++ host = urlparse.urlsplit(baseurl).netloc.split('@')[-1] or baseurl
++
++ default_speed = default_grabber.opts.default_speed
++ try: speed, fail, ts = _TH.hosts[host]
++ except KeyError: return default_speed, 0
++
++ speed *= 2**-fail
++ k = 2**((ts - time.time()) / default_grabber.opts.half_life)
++ speed = k * speed + (1 - k) * default_speed
++ return speed, fail
++
++#####################################################################
+ # TESTING
+ def _main_test():
+ try: url, filename = sys.argv[1:3]
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
-index dad410b..8731aed 100644
+index dad410b..7975f1b 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
-@@ -90,7 +90,7 @@ CUSTOMIZATION
+@@ -76,6 +76,9 @@ CUSTOMIZATION
+ 'grabber' is omitted, the default grabber will be used. If
+ kwargs are omitted, then (duh) they will not be used.
+
++ kwarg 'max_connections' limits the number of concurrent
++ connections to this mirror.
++
+ 3) Pass keyword arguments when instantiating the mirror group.
+ See, for example, the failure_callback argument.
+
+@@ -87,10 +90,14 @@ CUSTOMIZATION
+ """
+
+
++import sys
import random
import thread # needed for locking to make this threadsafe
-from grabber import URLGrabError, CallbackObject, DEBUG
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
++from grabber import _run_callback, _do_raise
++from grabber import exception2msg
++from grabber import _TH
def _(st):
return st
-@@ -263,7 +263,8 @@ class MirrorGroup:
+@@ -126,7 +133,9 @@ class MirrorGroup:
+ files)
+
+ * if the local list is ever exhausted, a URLGrabError will be
+- raised (errno=256, no more mirrors)
++ raised (errno=256, No more mirrors). The 'errors' attribute
++ holds a list of (full_url, errmsg) tuples. This contains
++ all URLs tried and the corresponding error messages.
+
+ OPTIONS
+
+@@ -153,7 +162,8 @@ class MirrorGroup:
+
+ The 'fail' option will cause immediate failure by re-raising
+ the exception and no further attempts to get the current
+- download.
++ download. As in the "No more mirrors" case, the 'errors'
++ attribute is set in the exception object.
+
+ This dict can be set at instantiation time,
+ mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
+@@ -184,6 +194,7 @@ class MirrorGroup:
+
+ obj.exception = < exception that was raised >
+ obj.mirror = < the mirror that was tried >
++ obj.tries = < the number of mirror tries so far >
+ obj.relative_url = < url relative to the mirror >
+ obj.url = < full url that failed >
+ # .url is just the combination of .mirror
+@@ -251,6 +262,17 @@ class MirrorGroup:
+ self.default_action = None
+ self._process_kwargs(kwargs)
+
++ # use the same algorithm as parallel downloader to initially sort
++ # the mirror list (sort by speed, but prefer live private mirrors)
++ def estimate(m):
++ speed, fail = _TH.estimate(m['mirror'])
++ private = not fail and m.get('kwargs', {}).get('private', False)
++ return private, speed
++
++ # update the initial order. since sorting is stable, the relative
++ # order of unknown (not used yet) hosts is retained.
++ self.mirrors.sort(key=estimate, reverse=True)
++
+ # if these values are found in **kwargs passed to one of the urlXXX
+ # methods, they will be stripped before getting passed on to the
+ # grabber
+@@ -263,7 +285,8 @@ class MirrorGroup:
def _parse_mirrors(self, mirrors):
parsed_mirrors = []
for m in mirrors:
parsed_mirrors.append(m)
return parsed_mirrors
+@@ -280,7 +303,9 @@ class MirrorGroup:
+ # return a random mirror so that multiple mirrors get used
+ # even without failures.
+ if not gr.mirrors:
+- raise URLGrabError(256, _('No more mirrors to try.'))
++ e = URLGrabError(256, _('No more mirrors to try.'))
++ e.errors = gr.errors
++ raise e
+ return gr.mirrors[gr._next]
+
+ def _failure(self, gr, cb_obj):
+@@ -307,7 +332,9 @@ class MirrorGroup:
+ a.update(action)
+ action = a
+ self.increment_mirror(gr, action)
+- if action and action.get('fail', 0): raise
++ if action and action.get('fail', 0):
++ sys.exc_info()[1].errors = gr.errors
++ raise
+
+ def increment_mirror(self, gr, action={}):
+ """Tell the mirror object increment the mirror index
+@@ -377,35 +404,50 @@ class MirrorGroup:
+ gr.url = url
+ gr.kw = dict(kw)
+ self._load_gr(gr)
++ gr.errors = []
+
+ for k in self.options:
+ try: del kw[k]
+ except KeyError: pass
+
++ tries = 0
+ while 1:
++ tries += 1
+ mirrorchoice = self._get_mirror(gr)
+ fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
+- kwargs = dict(mirrorchoice.get('kwargs', {}))
+- kwargs.update(kw)
+ grabber = mirrorchoice.get('grabber') or self.grabber
++ # apply mirrorchoice kwargs on top of grabber.opts
++ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
+ func_ref = getattr(grabber, func)
+ if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+ try:
+- return func_ref( *(fullurl,), **kwargs )
++ return func_ref( *(fullurl,), opts=opts, **kw )
+ except URLGrabError, e:
+ if DEBUG: DEBUG.info('MIRROR: failed')
++ gr.errors.append((fullurl, exception2msg(e)))
+ obj = CallbackObject()
+ obj.exception = e
+ obj.mirror = mirrorchoice['mirror']
+ obj.relative_url = gr.url
+ obj.url = fullurl
++ obj.tries = tries
+ self._failure(gr, obj)
+
+ def urlgrab(self, url, filename=None, **kwargs):
+ kw = dict(kwargs)
+ kw['filename'] = filename
++ if kw.get('async'):
++ # enable mirror failovers in async path
++ kw['mirror_group'] = self, [], {}, set()
++ kw['relative_url'] = url
++ else:
++ kw.pop('failfunc', None)
+ func = 'urlgrab'
+- return self._mirror_try(func, url, kw)
++ try:
++ return self._mirror_try(func, url, kw)
++ except URLGrabError, e:
++ obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
++ return _run_callback(kwargs.get('failfunc', _do_raise), obj)
+
+ def urlopen(self, url, **kwargs):
+ kw = dict(kwargs)
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
-index dd07c6a..45eb248 100644
+index dd07c6a..077fd99 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
-@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
+@@ -133,8 +133,8 @@ class BaseMeter:
+ # for a real gui, you probably want to override and put a call
+ # to your mainloop iteration function here
+ if now is None: now = time.time()
+- if (now >= self.last_update_time + self.update_period) or \
+- not self.last_update_time:
++ if (not self.last_update_time or
++ (now >= self.last_update_time + self.update_period)):
+ self.re.update(amount_read, now)
+ self.last_amount_read = amount_read
+ self.last_update_time = now
+@@ -211,6 +211,21 @@ def text_meter_total_size(size, downloaded=0):
+ # 4. + ( 5, total: 32)
+ #
+
++def _term_add_bar(tl, bar_max_length, pc):
++ blen = bar_max_length
++ bar = '='*int(blen * pc)
++ if (blen * pc) - int(blen * pc) >= 0.5:
++ bar += '-'
++ return tl.add(' [%-*.*s]' % (blen, blen, bar))
++
++def _term_add_end(tl, osize, size):
++ if osize is not None:
++ if size > osize: # Is ??? better? Really need something to say < vs >.
++ return tl.add(' !!! '), True
++ elif size != osize:
++ return tl.add(' ... '), True
++ return tl.add(' ' * 5), False
++
+ class TextMeter(BaseMeter):
+ def __init__(self, fo=sys.stderr):
+ BaseMeter.__init__(self)
+@@ -218,7 +233,6 @@ class TextMeter(BaseMeter):
+
+ def _do_update(self, amount_read, now=None):
+ etime = self.re.elapsed_time()
+- fetime = format_time(etime)
+ fread = format_number(amount_read)
+ #self.size = None
+ if self.text is not None:
+@@ -234,16 +248,20 @@ class TextMeter(BaseMeter):
+
+ # Include text + ui_rate in minimal
+ tl = TerminalLine(8, 8+1+8)
++ if tl._llen > 80:
++ use_hours = True # For big screens, make it more readable.
++ else:
++ use_hours = False
+ ui_size = tl.add(' | %5sB' % fread)
+ if self.size is None:
+- ui_time = tl.add(' %9s' % fetime)
++ ui_time = tl.add(' %9s' % format_time(etime, use_hours))
+ ui_end = tl.add(' ' * 5)
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+ out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_rate, ui_size, ui_time, ui_end)
+ else:
+ rtime = self.re.remaining_time()
+- frtime = format_time(rtime)
++ frtime = format_time(rtime, use_hours)
+ frac = self.re.fraction_read()
+
+ ui_time = tl.add(' %9s' % frtime)
+@@ -259,13 +277,10 @@ class TextMeter(BaseMeter):
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+ # Make text grow a bit before we start growing the bar too
+ blen = 4 + tl.rest_split(8 + 8 + 4)
+- bar = '='*int(blen * frac)
+- if (blen * frac) - int(blen * frac) >= 0.5:
+- bar += '-'
+- ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
+- out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+- ui_sofar_pc, ui_pc, ui_bar,
+- ui_rate, ui_size, ui_time, ui_end)
++ ui_bar = _term_add_bar(tl, blen, frac)
++ out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
++ ui_sofar_pc, ui_pc, ui_bar,
++ ui_rate,ui_size,ui_time, ui_end)
+
+ self.fo.write(out)
+ self.fo.flush()
+@@ -274,7 +289,6 @@ class TextMeter(BaseMeter):
+ global _text_meter_total_size
+ global _text_meter_sofar_size
+
+- total_time = format_time(self.re.elapsed_time())
+ total_size = format_number(amount_read)
+ if self.text is not None:
+ text = self.text
+@@ -282,14 +296,13 @@ class TextMeter(BaseMeter):
+ text = self.basename
+
+ tl = TerminalLine(8)
+- ui_size = tl.add(' | %5sB' % total_size)
+- ui_time = tl.add(' %9s' % total_time)
+- not_done = self.size is not None and amount_read != self.size
+- if not_done:
+- ui_end = tl.add(' ... ')
++ if tl._llen > 80:
++ use_hours = True # For big screens, make it more readable.
+ else:
+- ui_end = tl.add(' ' * 5)
+-
++ use_hours = False
++ ui_size = tl.add(' | %5sB' % total_size)
++ ui_time = tl.add(' %9s' % format_time(self.re.elapsed_time(),use_hours))
++ ui_end, not_done = _term_add_end(tl, self.size, amount_read)
+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+ ui_size, ui_time, ui_end)
+ self.fo.write(out)
+@@ -331,12 +344,21 @@ class MultiFileHelper(BaseMeter):
+ def message(self, message):
+ self.master.message_meter(self, message)
+
++class _FakeLock:
++ def acquire(self):
++ pass
++ def release(self):
++ pass
++
+ class MultiFileMeter:
+ helperclass = MultiFileHelper
+- def __init__(self):
++ def __init__(self, threaded=True):
+ self.meters = []
+ self.in_progress_meters = []
+- self._lock = thread.allocate_lock()
++ if threaded:
++ self._lock = thread.allocate_lock()
++ else:
++ self._lock = _FakeLock()
+ self.update_period = 0.3 # seconds
+
+ self.numfiles = None
+@@ -369,6 +391,7 @@ class MultiFileMeter:
+
+ def end(self, now=None):
+ if now is None: now = time.time()
++ self.re.update(self._amount_read(), now)
+ self._do_end(now)
+
+ def _do_end(self, now):
+@@ -407,8 +430,8 @@ class MultiFileMeter:
+ def update_meter(self, meter, now):
+ if not meter in self.meters:
+ raise ValueError('attempt to use orphaned meter')
+- if (now >= self.last_update_time + self.update_period) or \
+- not self.last_update_time:
++ if (not self.last_update_time or
++ (now >= self.last_update_time + self.update_period)):
+ self.re.update(self._amount_read(), now)
+ self.last_update_time = now
+ self._do_update_meter(meter, now)
+@@ -466,34 +489,87 @@ class MultiFileMeter:
+
+
+ class TextMultiFileMeter(MultiFileMeter):
+- def __init__(self, fo=sys.stderr):
++ def __init__(self, fo=sys.stderr, threaded=True):
+ self.fo = fo
+- MultiFileMeter.__init__(self)
++ MultiFileMeter.__init__(self, threaded)
++ self.index_time = self.index = 0
+
+ # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
++# New output, like TextMeter output...
++# update: No size (minimal: 17 chars)
++# -----------------------------------
++# (<#file>/<#tot files>): <text> <rate> | <current size> <elapsed>
++# 8-48 1 8 3 6 1 7-9 5
++#
++# update: Size, All files
++# -----------------------
++# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
++# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1
++# end
++# ---
++# <text> | <file size> <file elapsed time>
++# 8-56 3 6 1 9 5
+ def _do_update_meter(self, meter, now):
+ self._lock.acquire()
+ try:
+- format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
+- "time: %8.8s/%8.8s"
+ df = self.finished_files
+ tf = self.numfiles or 1
+- pf = 100 * float(df)/tf + 0.49
++ # Don't use "percent of files complete" ...
++ # pf = 100 * float(df)/tf + 0.49
+ dd = self.re.last_amount_read
+- td = self.total_size
++ td = self.re.total
+ pd = 100 * (self.re.fraction_read() or 0) + 0.49
+ dt = self.re.elapsed_time()
+ rt = self.re.remaining_time()
+- if rt is None: tt = None
+- else: tt = dt + rt
+
+- fdd = format_number(dd) + 'B'
+- ftd = format_number(td) + 'B'
+- fdt = format_time(dt, 1)
+- ftt = format_time(tt, 1)
+-
+- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
+- self.fo.write('\r' + out)
++ frac = self.re.fraction_read() or 0
++ pf = 100 * frac
++ ave_dl = format_number(self.re.average_rate())
++
++ # cycle through active meters
++ if now > self.index_time:
++ self.index_time = now + 1.0
++ self.index += 1
++ if self.index >= len(self.meters):
++ self.index = 0
++ meter = self.meters[self.index]
++ text = meter.text or meter.basename
++ if tf > 1:
++ text = '(%u/%u): %s' % (df+1+self.index, tf, text)
++
++ # Include text + ui_rate in minimal
++ tl = TerminalLine(8, 8+1+8)
++ if tl._llen > 80:
++ use_hours = True # For big screens, make it more readable.
++ time_len = 9
++ else:
++ use_hours = False
++ time_len = 7
++
++ ui_size = tl.add(' | %5sB' % format_number(dd))
++
++ if not self.re.total:
++ ui_time = tl.add(' %*s' % (time_len,format_time(dt, use_hours)))
++ ui_end = tl.add(' ' * 5)
++ ui_rate = tl.add(' %5sB/s' % ave_dl)
++ out = '\r%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
++ ui_rate, ui_size, ui_time, ui_end)
++ else:
++ ui_time = tl.add(' %*s' % (time_len,format_time(rt, use_hours)))
++ ui_end = tl.add(' ETA ')
++
++ ui_sofar_pc = tl.add(' %i%%' % pf,
++ full_len=len(" (100%)"))
++ ui_rate = tl.add(' %5sB/s' % ave_dl)
++
++ # Make text grow a bit before we start growing the bar too
++ blen = 4 + tl.rest_split(8 + 8 + 4)
++ ui_bar = _term_add_bar(tl, blen, frac)
++ out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
++ ui_sofar_pc, ui_bar,
++ ui_rate, ui_size, ui_time,
++ ui_end)
++ self.fo.write(out)
+ self.fo.flush()
+ finally:
+ self._lock.release()
+@@ -502,24 +578,40 @@ class TextMultiFileMeter(MultiFileMeter):
+ self._lock.acquire()
+ try:
+ format = "%-30.30s %6.6s %8.8s %9.9s"
+- fn = meter.basename
++ fn = meter.text or meter.basename
+ size = meter.last_amount_read
+ fsize = format_number(size) + 'B'
+ et = meter.re.elapsed_time()
+- fet = format_time(et, 1)
+- frate = format_number(size / et) + 'B/s'
+-
+- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
+- self.fo.write('\r' + out + '\n')
++ frate = format_number(et and size / et) + 'B/s'
++ df = self.finished_files
++ tf = self.numfiles or 1
++
++ total_size = format_number(size)
++ text = meter.text or meter.basename
++ if tf > 1:
++ text = '(%u/%u): %s' % (df, tf, text)
++
++ tl = TerminalLine(8)
++ if tl._llen > 80:
++ use_hours = True # For big screens, make it more readable.
++ time_len = 9
++ else:
++ use_hours = False
++ time_len = 7
++ ui_size = tl.add(' | %5sB' % total_size)
++ ui_time = tl.add(' %*s' % (time_len, format_time(et, use_hours)))
++ ui_end, not_done = _term_add_end(tl, meter.size, size)
++ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
++ ui_size, ui_time, ui_end)
++ self.fo.write(out)
+ finally:
+ self._lock.release()
+- self._do_update_meter(meter, now)
+
+ def _do_failure_meter(self, meter, message, now):
+ self._lock.acquire()
+ try:
+ format = "%-30.30s %6.6s %s"
+- fn = meter.basename
++ fn = meter.text or meter.basename
+ if type(message) in (type(''), type(u'')):
+ message = message.splitlines()
+ if not message: message = ['']
+@@ -536,15 +628,6 @@ class TextMultiFileMeter(MultiFileMeter):
+ pass
+ finally:
+ self._lock.release()
+-
+- def _do_end(self, now):
+- self._do_update_meter(None, now)
+- self._lock.acquire()
+- try:
+- self.fo.write('\n')
+- self.fo.flush()
+- finally:
+- self._lock.release()
+
+ ######################################################################
+ # support classes and functions
+@@ -658,6 +741,8 @@ def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0:
if use_hours: return '--:--:--'
else: return '--:--'
else:
seconds = int(seconds)
minutes = seconds / 60
+@@ -722,9 +807,77 @@ def _tst(fn, cur, tot, beg, size, *args):
+ time.sleep(delay)
+ tm.end(size)
+
++def _mtst(datas, *args):
++ print '-' * 79
++ tm = TextMultiFileMeter(threaded=False)
++
++ dl_sizes = {}
++
++ num = 0
++ total_size = 0
++ dl_total_size = 0
++ for data in datas:
++ dl_size = None
++ if len(data) == 2:
++ fn, size = data
++ dl_size = size
++ if len(data) == 3:
++ fn, size, dl_size = data
++ nm = tm.newMeter()
++ nm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size,
++ text=fn)
++ num += 1
++ assert dl_size is not None
++ dl_total_size += dl_size
++ dl_sizes[nm] = dl_size
++ if size is None or total_size is None:
++ total_size = None
++ else:
++ total_size += size
++ tm.start(num, total_size)
++
++ num = 0
++ off = 0
++ for (inc, delay) in args:
++ off += 1
++ while num < ((dl_total_size * off) / len(args)):
++ num += inc
++ for nm in tm.meters[:]:
++ if dl_sizes[nm] <= num:
++ nm.end(dl_sizes[nm])
++ tm.removeMeter(nm)
++ else:
++ nm.update(num)
++ time.sleep(delay)
++ assert not tm.meters
++
+ if __name__ == "__main__":
+ # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
+ # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
++ if len(sys.argv) >= 2 and sys.argv[1] == 'multi':
++ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
++ ("s-1.0.1-1.fc8.i386.rpm", 5000),
++ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
++ (100, 0.33), (500, 0.25), (1000, 0.1))
++
++ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
++ ("s-1.0.1-1.fc8.i386.rpm", 5000),
++ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
++ (100, 0.33), (500, 0.25), (1000, 0.1))
++
++ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
++ ("s-1.0.1-1.fc8.i386.rpm", 2500000),
++ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
++ (10, 0.2), (50, 0.1), (1000, 0.1))
++
++ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
++ ("s-1.0.1-1.fc8.i386.rpm", None, 2500000),
++ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
++ (10, 0.2), (50, 0.1), (1000, 0.1))
++ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
++ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
++ sys.exit(0)
++
+ if len(sys.argv) >= 2 and sys.argv[1] == 'total':
+ text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
+ 1000000 + 10000 + 10000 + 10000 + 1000000)