diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..09cd896 100644
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -115,6 +115,7 @@ options:
                     including quotes in the case of strings.
                     e.g. --user_agent='"foobar/2.0"'
 
+  --output FILE
   -o FILE         write output to FILE, otherwise the basename of the
                   url will be used
   -O              print the names of saved files to STDOUT
@@ -170,12 +171,17 @@ class client_options:
         return ug_options, ug_defaults
 
     def process_command_line(self):
-        short_options = 'vd:hoOpD'
+        short_options = 'vd:ho:OpD'
         long_options = ['profile', 'repeat=', 'verbose=',
-                        'debug=', 'help', 'progress']
+                        'debug=', 'help', 'progress', 'output=']
         ug_long = [ o + '=' for o in self.ug_options ]
-        optlist, args = getopt.getopt(sys.argv[1:], short_options,
-                                      long_options + ug_long)
+        try:
+            optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                          long_options + ug_long)
+        except getopt.GetoptError, e:
+            print >>sys.stderr, "Error:", e
+            self.help([], ret=1)
+
         self.verbose = 0
         self.debug = None
         self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
             if o == '--verbose': self.verbose = v
             if o == '-v': self.verbose += 1
             if o == '-o': self.outputfile = v
+            if o == '--output': self.outputfile = v
             if o == '-p' or o == '--progress': self.progress = 1
             if o == '-d' or o == '--debug': self.debug = v
             if o == '--profile': self.profile = 1
@@ -222,7 +229,7 @@ class client_options:
             print "ERROR: cannot use -o when grabbing multiple files"
             sys.exit(1)
 
-    def help(self, args):
+    def help(self, args, ret=0):
         if not args:
             print MAINHELP
         else:
@@ -234,7 +241,7 @@ class client_options:
                     self.help_ug_option(a)
                 else:
                     print 'ERROR: no help on command "%s"' % a
-        sys.exit(0)
+        sys.exit(ret)
 
     def help_doc(self):
         print __doc__
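
The try/except above is the whole fix for unknown options: instead of an unhandled getopt.GetoptError traceback, the script now reports the problem on stderr and exits non-zero through help([], ret=1). A minimal standalone sketch of the same Python 2 pattern (option names shortened, behavior otherwise the same):

    import sys
    import getopt

    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'ho:', ['help', 'output='])
    except getopt.GetoptError, e:
        # report the bad option and fail loudly, as the patched script does
        print >>sys.stderr, "Error:", e
        sys.exit(1)
    for o, v in optlist:
        if o in ('-o', '--output'):
            print 'would save to %s' % v
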
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..5fb43f9 100644
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,6 +1,6 @@
 from munittest import *
 
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+base_http = 'http://urlgrabber.baseurl.org/test/'
 base_ftp = 'ftp://localhost/test/'
 
 # set to a proftp server only. we're working around a couple of
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 3e5f3b7..8eeaeda 100644
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -68,7 +68,7 @@ class HTTPRangeHandler(urllib2.BaseHandler):
 
     def http_error_416(self, req, fp, code, msg, hdrs):
         # HTTP's Range Not Satisfiable error
-        raise RangeError('Requested Range Not Satisfiable')
+        raise RangeError(9, 'Requested Range Not Satisfiable')
 
 class HTTPSRangeHandler(HTTPRangeHandler):
     """ Range Header support for HTTPS. """
@@ -208,7 +208,7 @@ class RangeableFileObject:
             bufsize = offset - pos
             buf = self.fo.read(bufsize)
             if len(buf) != bufsize:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             pos+= bufsize
 
 class FileRangeHandler(urllib2.FileHandler):
@@ -238,7 +238,7 @@ class FileRangeHandler(urllib2.FileHandler):
             (fb,lb) = brange
             if lb == '': lb = size
             if fb < 0 or fb > size or lb > size:
-                raise RangeError('Requested Range Not Satisfiable')
+                raise RangeError(9, 'Requested Range Not Satisfiable')
             size = (lb - fb)
             fo = RangeableFileObject(fo, (fb,lb))
         headers = mimetools.Message(StringIO(
@@ -318,12 +318,12 @@ class FTPRangeHandler(urllib2.FTPHandler):
             (fb,lb) = range_tup
             if lb == '':
                 if retrlen is None or retrlen == 0:
-                    raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+                    raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
                 lb = retrlen
                 retrlen = lb - fb
                 if retrlen < 0:
                     # beginning of range is larger than file
-                    raise RangeError('Requested Range Not Satisfiable')
+                    raise RangeError(9, 'Requested Range Not Satisfiable')
             else:
                 retrlen = lb - fb
                 fp = RangeableFileObject(fp, (0,retrlen))
@@ -458,6 +458,6 @@ def range_tuple_normalize(range_tup):
     # check if range is over the entire file
     if (fb,lb) == (0,''): return None
     # check that the range is valid
-    if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+    if lb < fb: raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
     return (fb,lb)
 
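
Giving RangeError a leading numeric argument matters because the exception flows through the IOError machinery: with two arguments, the first lands in .errno and the second in .strerror, so callers can test a stable code instead of parsing the message. A minimal sketch (Python 2; assuming RangeError subclasses IOError as byterange.py defines it):

    from urlgrabber.byterange import RangeError

    try:
        raise RangeError(9, 'Requested Range Not Satisfiable')
    except RangeError, e:
        print e.errno      # 9
        print e.strerror   # 'Requested Range Not Satisfiable'
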
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..b2770c5 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -68,14 +68,14 @@ GENERAL ARGUMENTS (kwargs)
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.
 
-  timeout = None
+  timeout = 300
 
-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option, see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
 
   bandwidth = 0
 
@@ -198,6 +198,12 @@ GENERAL ARGUMENTS (kwargs)
     control, you should probably subclass URLParser and pass it in via
     the 'urlparser' option.
 
+  username = None
+    username to use for simple http auth - is automatically quoted for special characters
+
+  password = None
+    password to use for simple http auth - is automatically quoted for special characters
+
   ssl_ca_cert = None
 
     this option can be used if M2Crypto is available and will be
@@ -248,6 +254,11 @@ GENERAL ARGUMENTS (kwargs)
 
     Maximum size (in bytes) of the headers.
 
+  self.ip_resolve = 'whatever'
+
+    What type of name to IP resolving to use, default is to do both IPV4 and
+    IPV6.
+
 
 RETRY RELATED ARGUMENTS
 
@@ -420,6 +431,7 @@ import time
 import string
 import urllib
 import urllib2
+from httplib import responses
 import mimetools
 import thread
 import types
@@ -439,6 +451,12 @@ try:
 except:
     __version__ = '???'
 
+try:
+    # this part isn't going to do much - need to talk to gettext
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+
 ########################################################################
 # functions for debugging output. These functions are here because they
 # are also part of the module initialization.
@@ -527,6 +545,22 @@ def _(st):
 # END MODULE INITIALIZATION
 ########################################################################
 
+########################################################################
+# UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+    '''convert 'unicode' to an encoded utf-8 byte string '''
+    # stolen from yum.i18n
+    if isinstance(obj, unicode):
+        obj = obj.encode('utf-8', errors)
+    return obj
+
+########################################################################
+# END UTILITY FUNCTIONS
+########################################################################
 
 
 class URLGrabError(IOError):
@@ -662,6 +696,7 @@ class URLParser:
         opts.quote = 0     --> do not quote it
         opts.quote = None  --> guess
         """
+        url = _to_utf8(url)
         quote = opts.quote
 
         if opts.prefix:
@@ -800,6 +835,7 @@ class URLGrabberOptions:
         self.close_connection = 0
         self.range = None
         self.user_agent = 'urlgrabber/%s' % __version__
+        self.ip_resolve = None
         self.keepalive = 1
         self.proxies = None
         self.reget = None
@@ -808,13 +844,15 @@ class URLGrabberOptions:
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
         self.data = None
         self.urlparser = URLParser()
         self.quote = None
+        self.username = None
+        self.password = None
         self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
         self.ssl_context = None # no-op in pycurl
         self.ssl_verify_peer = True # check peer's cert for authenticityb
@@ -846,7 +884,7 @@ class URLGrabberOptions:
         s = s + indent + '}'
         return s
 
-class URLGrabber:
+class URLGrabber(object):
     """Provides easy opening of URLs with a variety of options.
 
     All options are specified as kwargs. Options may be specified when
@@ -931,6 +969,9 @@ class URLGrabber:
         (scheme, host, path, parm, query, frag) = parts
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
+            if not filename:
+                # This is better than nothing.
+                filename = 'index.html'
         if scheme == 'file' and not opts.copy_local:
             # just return the name of the local file - don't make a
             # copy currently
@@ -1030,7 +1071,7 @@ class URLGrabber:
 default_grabber = URLGrabber()
 
 
-class PyCurlFileObject():
+class PyCurlFileObject(object):
     def __init__(self, url, filename, opts):
         self.fo = None
         self._hdr_dump = ''
@@ -1052,9 +1093,15 @@ class PyCurlFileObject():
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
         self._do_open()
 
+
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
 
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
@@ -1085,9 +1132,14 @@ class PyCurlFileObject():
             return -1
 
     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump),
                                max_size=self.opts.max_header_size):
-            return -1
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1156,17 @@ class PyCurlFileObject():
                 s = parse150(buf)
             if s:
                 self.size = int(s)
-
+
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.info('header ended:')
+
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
@@ -1113,8 +1175,10 @@ class PyCurlFileObject():
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr = mimetools.Message(hdrfp)
         return self._parsed_hdr
 
@@ -1136,11 +1200,21 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
 
         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
         if opts.user_agent:
             self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+        if opts.ip_resolve:
+            # Default is: IPRESOLVE_WHATEVER
+            ipr = opts.ip_resolve.lower()
+            if ipr == 'whatever': # Do we need this?
+                self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER)
+            if ipr == 'ipv4':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+            if ipr == 'ipv6':
+                self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
 
         # maybe to be options later
         self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
@@ -1148,9 +1222,11 @@ class PyCurlFileObject():
 
         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
 
         # ssl options
         if self.scheme == 'https':
@@ -1203,12 +1279,19 @@ class PyCurlFileObject():
             if proxy == '_none_': proxy = ""
             self.curl_obj.setopt(pycurl.PROXY, proxy)
 
-        # FIXME username/password/auth settings
+        if opts.username and opts.password:
+            if self.scheme in ('http', 'https'):
+                self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+
+            if opts.username and opts.password:
+                # apparently when applying them as curlopts they do not require quoting of any kind
+                userpwd = '%s:%s' % (opts.username, opts.password)
+                self.curl_obj.setopt(pycurl.USERPWD, userpwd)
 
         #posts - simple - expects the fields as they are
         if opts.data:
             self.curl_obj.setopt(pycurl.POST, True)
-            self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+            self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
 
         # our url
         self.curl_obj.setopt(pycurl.URL, self.url)
@@ -1228,12 +1311,14 @@
 
             code = self.http_code
             errcode = e.args[0]
+            errurl = urllib.unquote(self.url)
+
             if self._error[0]:
                 errcode = self._error[0]
 
             if errcode == 23 and code >= 200 and code < 299:
-                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+                err.url = errurl
 
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
@@ -1244,23 +1329,23 @@ class PyCurlFileObject():
                 raise KeyboardInterrupt
 
             elif errcode == 28:
-                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
+                err.url = errurl
                 raise err
             elif errcode == 35:
                 msg = _("problem making ssl connection")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
             elif errcode == 37:
-                msg = _("Could not open/read %s") % (self.url)
+                msg = _("Could not open/read %s") % (errurl)
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif errcode == 42:
-                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
-                err.url = self.url
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (errurl, e))
+                err.url = errurl
                 # this is probably wrong but ultimately this is what happens
                 # we have a legit http code and a pycurl 'writer failed' code
                 # which almost always means something aborted it from outside
@@ -1272,33 +1357,93 @@ class PyCurlFileObject():
             elif errcode == 58:
                 msg = _("problem with the local client certificate")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif errcode == 60:
-                msg = _("client cert cannot be verified or client cert incorrect")
+                msg = _("Peer cert cannot be verified or peer cert invalid")
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif errcode == 63:
                 if self._error[1]:
                     msg = self._error[1]
                 else:
-                    msg = _("Max download size exceeded on %s") % (self.url)
+                    msg = _("Max download size exceeded on %s") % (errurl)
                 err = URLGrabError(14, msg)
-                err.url = self.url
+                err.url = errurl
                 raise err
 
             elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+                if self.scheme in ['http', 'https']:
+                    if self.http_code in responses:
+                        resp = responses[self.http_code]
+                        msg = 'HTTP Error %s - %s : %s' % (self.http_code, resp, errurl)
+                    else:
+                        msg = 'HTTP Error %s : %s ' % (self.http_code, errurl)
+                elif self.scheme in ['ftp']:
+                    msg = 'FTP Error %s : %s ' % (self.http_code, errurl)
+                else:
+                    msg = "Unknown Error: URL=%s , scheme=%s" % (errurl, self.scheme)
             else:
-                msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+                pyerr2str = { 5 : _("Couldn't resolve proxy"),
+                              6 : _("Couldn't resolve host"),
+                              7 : _("Couldn't connect"),
+                              8 : _("Bad reply to FTP server"),
+                              9 : _("Access denied"),
+                             11 : _("Bad reply to FTP pass"),
+                             13 : _("Bad reply to FTP pasv"),
+                             14 : _("Bad reply to FTP 227"),
+                             15 : _("Couldn't get FTP host"),
+                             17 : _("Couldn't set FTP type"),
+                             18 : _("Partial file"),
+                             19 : _("FTP RETR command failed"),
+                             22 : _("HTTP returned error"),
+                             23 : _("Write error"),
+                             25 : _("Upload failed"),
+                             26 : _("Read error"),
+                             27 : _("Out of Memory"),
+                             28 : _("Operation timed out"),
+                             30 : _("FTP PORT command failed"),
+                             31 : _("FTP REST command failed"),
+                             33 : _("Range failed"),
+                             34 : _("HTTP POST failed"),
+                             35 : _("SSL CONNECT failed"),
+                             36 : _("Couldn't resume download"),
+                             37 : _("Couldn't read file"),
+                             42 : _("Aborted by callback"),
+                             47 : _("Too many redirects"),
+                             51 : _("Peer certificate failed verification"),
+                             53 : _("SSL engine not found"),
+                             54 : _("SSL engine set failed"),
+                             55 : _("Network error send()"),
+                             56 : _("Network error recv()"),
+                             58 : _("Local certificate failed"),
+                             59 : _("SSL set cipher failed"),
+                             60 : _("Local CA certificate failed"),
+                             61 : _("HTTP bad transfer encoding"),
+                             63 : _("Maximum file size exceeded"),
+                             64 : _("FTP SSL failed"),
+                             67 : _("Authentication failure"),
+                             70 : _("Out of disk space on server"),
+                             73 : _("Remove file exists"),
+                             }
+                errstr = str(e.args[1])
+                if not errstr:
+                    errstr = pyerr2str.get(errcode, '<Unknown>')
+                msg = 'curl#%s - "%s"' % (errcode, errstr)
                 code = errcode
             err = URLGrabError(14, msg)
             err.code = code
             err.exception = e
             raise err
+        else:
+            if self._error[1]:
+                msg = self._error[1]
+                err = URLGrabError(14, msg)
+                err.url = urllib.unquote(self.url)
+                raise err
 
     def _do_open(self):
         self.curl_obj = _curl_cache
@@ -1333,7 +1478,11 @@ class PyCurlFileObject():
 
         if self.opts.range:
             rt = self.opts.range
-            if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+            if rt[0] is None:
+                rt = (0, rt[1])
+            rt = (rt[0] + reget_length, rt[1])
+
 
         if rt:
             header = range_tuple_to_header(rt)
@@ -1434,9 +1583,13 @@ class PyCurlFileObject():
             #fh, self._temp_name = mkstemp()
             #self.fo = open(self._temp_name, 'wb')
 
-
-        self._do_perform()
-
+        try:
+            self._do_perform()
+        except URLGrabError, e:
+            self.fo.flush()
+            self.fo.close()
+            raise e
+
 
 
         if _was_filename:
@@ -1446,9 +1599,23 @@ class PyCurlFileObject():
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _(\
+                      'error setting timestamp on file %s from %s, OSError: %s')
+                              % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
@@ -1532,11 +1699,14 @@ class PyCurlFileObject():
     def _over_max_size(self, cur, max_size=None):
 
         if not max_size:
-            max_size = self.size
-            if self.opts.size: # if we set an opts size use that, no matter what
-                max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
+
+        if cur > int(float(max_size) * 1.10):
 
             msg = _("Downloaded more than max size for %s: %s > %s") \
                         % (self.url, cur, max_size)
@@ -1544,13 +1714,6 @@ class PyCurlFileObject():
             return True
         return False
 
-    def _to_utf8(self, obj, errors='replace'):
-        '''convert 'unicode' to an encoded utf-8 byte string '''
-        # stolen from yum.i18n
-        if isinstance(obj, unicode):
-            obj = obj.encode('utf-8', errors)
-        return obj
-
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
@@ -1582,9 +1745,21 @@ class PyCurlFileObject():
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()
 
-
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+            urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
+
 _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
 
+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+
+
 
 #####################################################################
 # DEPRECATED FUNCTIONS
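
The grabber.py changes are easiest to see from the caller's side: timeout now drives pycurl's CONNECTTIMEOUT and LOW_SPEED_TIME, ip_resolve pins name resolution to one address family, username/password enable simple HTTP auth, and failures surface as URLGrabError carrying the friendlier curl#NN strings above. A minimal sketch (untested; the host and credentials are placeholders):

    from urlgrabber.grabber import URLGrabber, URLGrabError

    g = URLGrabber(timeout=30,          # seconds; None or 0 disables the connect timeout
                   ip_resolve='ipv4',   # 'whatever' (default), 'ipv4' or 'ipv6'
                   username='user',     # placeholder credentials for simple HTTP auth
                   password='secret')
    try:
        path = g.urlgrab('http://example.com/some/file')
    except URLGrabError, e:
        # errno 12 = timeout, 14 = fatal pycurl/HTTP error (see the curl#NN map)
        print 'download failed: [%d] %s' % (e.errno, e.strerror)
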
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..8731aed 100644
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -90,7 +90,7 @@ CUSTOMIZATION
 import random
 import thread  # needed for locking to make this threadsafe
 
-from grabber import URLGrabError, CallbackObject, DEBUG
+from grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
 
 def _(st):
     return st
@@ -263,7 +263,8 @@ class MirrorGroup:
     def _parse_mirrors(self, mirrors):
         parsed_mirrors = []
         for m in mirrors:
-            if type(m) == type(''): m = {'mirror': m}
+            if isinstance(m, basestring):
+                m = {'mirror': _to_utf8(m)}
             parsed_mirrors.append(m)
         return parsed_mirrors
 
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..45eb248 100644
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0):
     if seconds is None or seconds < 0:
         if use_hours: return '--:--:--'
         else:         return '--:--'
+    elif seconds == float('inf'):
+        return 'Infinite'
     else:
         seconds = int(seconds)
         minutes = seconds / 60
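
The new branch guards the seconds = int(seconds) conversion below it, which raises OverflowError for float('inf'), an ETA that legitimately occurs when the measured transfer rate is zero. In short:

    from urlgrabber.progress import format_time

    print format_time(90)            # '01:30'
    print format_time(None)          # '--:--'
    print format_time(float('inf'))  # 'Infinite' (previously an OverflowError)
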