Return complete lines from codec stream readers

author Martin v. Löwis <martin@v.loewis.de>

Wed, 24 Aug 2005 07:38:36 +0000 (07:38 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Wed, 24 Aug 2005 07:38:36 +0000 (07:38 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Wed, 24 Aug 2005 07:38:36 +0000 (07:38 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Wed, 24 Aug 2005 07:38:36 +0000 (07:38 +0000)
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex

index d98f474e93206f03719cfa865121dfa4200dbd17..7e22386814c27087cfa91736585218f3d75a12f4 100644 (file)
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -394,7 +394,7 @@ order to be compatible to the Python codec registry.
    be extended with \function{register_error()}.
  \end{classdesc}
  
-\begin{methoddesc}{read}{\optional{size\optional{, chars}}}
+\begin{methoddesc}{read}{\optional{size\optional{, chars\optional{, firstline}}}}
    Decodes data from the stream and returns the resulting object.
  
    \var{chars} indicates the number of characters to read from the
@@ -408,12 +408,16 @@ order to be compatible to the Python codec registry.
    decode as much as possible.  \var{size} is intended to prevent having
    to decode huge files in one step.
  
+  \var{firstline} indicates that it is sufficient to return only the
+  first line if there are decoding errors on later lines.
+
    The method should use a greedy read strategy meaning that it should
    read as much data as is allowed within the definition of the encoding
    and the given size, e.g.  if optional encoding endings or state
    markers are available on the stream, these should be read too.
  
    \versionchanged[\var{chars} argument added]{2.4}
+  \versionchanged[\var{firstline} argument added]{2.4.2}
  \end{methoddesc}
  
  \begin{methoddesc}{readline}{\optional{size\optional{, keepends}}}
diff --git a/Lib/codecs.py b/Lib/codecs.py

index 33b7481e918f1c4acb53dbee237acfcead3460bb..761cc0645166dd02e984968c0ea98f96e254159e 100644 (file)
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -236,7 +236,7 @@ class StreamReader(Codec):
      def decode(self, input, errors='strict'):
          raise NotImplementedError
  
-    def read(self, size=-1, chars=-1):
+    def read(self, size=-1, chars=-1, firstline=False):
  
          """ Decodes data from the stream self.stream and returns the
              resulting object.
@@ -253,6 +253,11 @@ class StreamReader(Codec):
              is intended to prevent having to decode huge files in one
              step.
  
+            If firstline is true and a UnicodeDecodeError happens
+            after the first line terminator in the input, only the first
+            line will be returned; the rest of the input will be kept
+            until the next call to read().
+
              The method should use a greedy read strategy meaning that
              it should read as much data as is allowed within the
              definition of the encoding and the given size, e.g.  if
@@ -275,7 +280,16 @@ class StreamReader(Codec):
                  newdata = self.stream.read(size)
              # decode bytes (those remaining from the last call included)
              data = self.bytebuffer + newdata
-            newchars, decodedbytes = self.decode(data, self.errors)
+            try:
+                newchars, decodedbytes = self.decode(data, self.errors)
+            except UnicodeDecodeError, exc:
+                if firstline:
+                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+                    lines = newchars.splitlines(True)
+                    if len(lines)<=1:
+                        raise
+                else:
+                    raise
              # keep undecoded bytes until the next call
              self.bytebuffer = data[decodedbytes:]
              # put new characters in the character buffer
@@ -306,7 +320,7 @@ class StreamReader(Codec):
          line = ""
          # If size is given, we call read() only once
          while True:
-            data = self.read(readsize)
+            data = self.read(readsize, firstline=True)
              if data:
                  # If we're at a "\r" read one extra character (which might
                  # be a "\n") to get a proper line ending. If the stream is
diff --git a/Misc/NEWS b/Misc/NEWS

index 48a7d89b2c2494ebcb1c83724b468f8c3ca3828c..1626d6a7cb6b85b6b57214d2373063029586ffe3 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -66,6 +66,10 @@ Extension Modules
  Library
  -------
  
+- Bug #1178484: Return complete lines from codec stream readers
+  even if there is an exception in later lines, resulting in
+  correct line numbers for decoding errors in source code. 
+
  - Bug #1266283: "lexists" is now in os.path.__all__.
  
  - The sets module can now properly compute s-=s and s^=s as an empty set.
author	Martin v. Löwis <martin@v.loewis.de>
	Wed, 24 Aug 2005 07:38:36 +0000 (07:38 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Wed, 24 Aug 2005 07:38:36 +0000 (07:38 +0000)
Doc/lib/libcodecs.tex		patch \| blob \| blame \| history
Lib/codecs.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history