Patch [ 1062060 ] fix for 1016880 urllib.urlretrieve silently truncates dwnld

author Georg Brandl <georg@python.org>

Wed, 24 Aug 2005 18:46:39 +0000 (18:46 +0000)

committer Georg Brandl <georg@python.org>

Wed, 24 Aug 2005 18:46:39 +0000 (18:46 +0000)
author Georg Brandl <georg@python.org>
Wed, 24 Aug 2005 18:46:39 +0000 (18:46 +0000)
committer Georg Brandl <georg@python.org>
Wed, 24 Aug 2005 18:46:39 +0000 (18:46 +0000)
diff --git a/Doc/lib/liburllib.tex b/Doc/lib/liburllib.tex

index dafdd917c540ffca5896d8345b56b63af0109c01..5e488c40971129b32cc6be719c85684a4b5e6af6 100644 (file)
--- a/Doc/lib/liburllib.tex
+++ b/Doc/lib/liburllib.tex
@@ -142,6 +142,25 @@ If the \var{url} uses the \file{http:} scheme identifier, the optional
  (normally the request type is \code{GET}).  The \var{data} argument
  must in standard \mimetype{application/x-www-form-urlencoded} format;
  see the \function{urlencode()} function below.
+
+\versionchanged[
+\function{urlretrieve()} will raise \exception{ContentTooShortError}
+when it detects that the amount of data available 
+was less than the expected amount (which is the size reported by a 
+\var{Content-Length} header). This can occur, for example, when the 
+download is interrupted.
+
+The \var{Content-Length} is treated as a lower bound: if there's more data 
+to read, urlretrieve reads more data, but if less data is available, 
+it raises the exception.
+
+You can still retrieve the downloaded data in this case; it is stored
+in the \member{content} attribute of the exception instance.
+
+If no \var{Content-Length} header was supplied, urlretrieve
+cannot check the size of the data it has downloaded, and just returns it.
+In this case you just have to assume that the download was successful.]{2.5}
+
  \end{funcdesc}
  
  \begin{datadesc}{_urlopener}
@@ -283,6 +302,15 @@ subclass may override this method to support more appropriate behavior
  if needed.}
  \end{classdesc}
  
+\begin{excclassdesc}{ContentTooShortError}{msg\optional{, content}}
+This exception is raised when the \function{urlretrieve()} function
+detects that the amount of the downloaded data is less than the 
+expected amount (given by the \var{Content-Length} header). The
+\member{content} attribute stores the downloaded (and supposedly
+truncated) data.
+\versionadded{2.5}
+\end{excclassdesc}
+
  Restrictions:
  
  \begin{itemize}
@@ -317,7 +345,7 @@ Web client using these functions without using threads.
  \item
  The data returned by \function{urlopen()} or \function{urlretrieve()}
  is the raw data returned by the server.  This may be binary data
-(e.g. an image), plain text or (for example) HTML\index{HTML}.  The
+(such as an image), plain text or (for example) HTML\index{HTML}.  The
  HTTP\indexii{HTTP}{protocol} protocol provides type information in the
  reply header, which can be inspected by looking at the
  \mailheader{Content-Type} header.  For the
diff --git a/Lib/urllib.py b/Lib/urllib.py

index 74b2aec79e86843fbdf98b62081ccc36ec35a329..4f1ebdd21ed81208ea48c25f2ec8a7b68c0196e7 100644 (file)
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -86,6 +86,11 @@ def urlcleanup():
      if _urlopener:
          _urlopener.cleanup()
  
+# exception raised when the downloaded size is less than the Content-Length
+class ContentTooShortError(IOError):
+    def __init__(self, message, content):
+        IOError.__init__(self, message)
+        self.content = content
  
  ftpcache = {}
  class URLopener:
@@ -228,24 +233,33 @@ class URLopener:
              self.tempcache[url] = result
          bs = 1024*8
          size = -1
+        read = 0
          blocknum = 1
          if reporthook:
              if "content-length" in headers:
                  size = int(headers["Content-Length"])
              reporthook(0, bs, size)
          block = fp.read(bs)
+        read += len(block)
          if reporthook:
              reporthook(1, bs, size)
          while block:
              tfp.write(block)
              block = fp.read(bs)
-            blocknum = blocknum + 1
+            read += len(block)
+            blocknum += 1
              if reporthook:
                  reporthook(blocknum, bs, size)
          fp.close()
          tfp.close()
          del fp
          del tfp
+
+        # raise exception if fewer bytes were read than Content-Length promised
+        if size >= 0 and read < size:
+            raise ContentTooShortError("retrieval incomplete: got only %i out "
+                                       "of %i bytes" % (read, size), result)
+
          return result
  
      # Each method named open_<type> knows how to open that type of URL
diff --git a/Misc/NEWS b/Misc/NEWS

index 7e21b7a50c3a26e62edc943e9d7962ae88017e53..fab6163a58a8977f798e93b1994386aa76a1f8ee 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -193,6 +193,10 @@ Extension Modules
  Library
  -------
  
+- Patch #1062060: urllib.urlretrieve() now raises a new exception, named
+  ContentTooShortError, when the actually downloaded size is less than
+  the size given by the Content-Length header.
+
  - Bug #1121494: distutils.dir_utils.mkpath now accepts Unicode strings.
  
  - Bug #1178484: Return complete lines from codec stream readers
author	Georg Brandl <georg@python.org>
	Wed, 24 Aug 2005 18:46:39 +0000 (18:46 +0000)
committer	Georg Brandl <georg@python.org>
	Wed, 24 Aug 2005 18:46:39 +0000 (18:46 +0000)
Doc/lib/liburllib.tex		patch \| blob \| blame \| history
Lib/urllib.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history